[필사] Costa Rican Household Poverty Level Prediction

머신러닝/캐글 2021. 10. 31. 23:00

# 코드 1

import numpy as np # linear algebra
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import f1_score
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.utils import class_weight

import warnings
warnings.filterwarnings("ignore")

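최신 scikit-learn에서는 sklearn.externals.joblib 이 제거되어 ImportError 가 나므로, 원본 커널의

from sklearn.externals.joblib import Parallel, delayed 를

from joblib import Parallel, delayed 이렇게 바꾸기 (위의 코드 1에는 이미 반영되어 있음)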

# 코드 2

from sklearn.preprocessing import LabelEncoder

# this only transforms the idhogar field, the other things this function used to do are done elsewhere
def encode_data(df):
    """Label-encode the ``idhogar`` column of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted on the column and the column is
    overwritten with the resulting integer labels. Nothing is returned;
    the caller's frame is modified directly.
    """
    household_encoder = LabelEncoder()
    household_codes = household_encoder.fit_transform(df['idhogar'])
    df['idhogar'] = household_codes

# rank (and optionally print) feature importances of a fitted estimator
def feature_importance(forest, X_train, display_results=True):
    """Rank the columns of ``X_train`` by ``forest.feature_importances_``.

    Parameters
    ----------
    forest : object
        Anything exposing a ``feature_importances_`` array.
    X_train : object
        Must provide ``shape`` and ``columns``; ``shape[1]`` decides how
        many ranked entries are produced.
    display_results : bool, default True
        When true, print a header followed by one line per feature.

    Returns
    -------
    tuple of (list, list)
        ``ranked_list`` holds the column names from most to least important;
        ``zero_features`` holds the names whose importance equals 0.0, in
        that same order.
    """
    scores = forest.feature_importances_
    # argsort is ascending, so reverse it to put the largest score first
    order = np.argsort(scores)[::-1]

    if display_results:
        # Print the feature ranking
        print("Feature ranking:")

    ranked_list = []
    zero_features = []
    n_features = X_train.shape[1]

    for position in range(n_features):
        col_idx = order[position]
        score = scores[col_idx]
        col_name = X_train.columns[col_idx]

        if display_results:
            print("%d. feature %d (%f)" % (position + 1, col_idx, score) + " - " + col_name)

        ranked_list.append(col_name)
        if score == 0.0:
            zero_features.append(col_name)

    return ranked_list, zero_features

코드 설명

1) LabelEncoder : 범주형 값(문자열 등)을 0부터 시작하는 정수 라벨로 바꿔주는 기능

# Create a LabelEncoder instance

encoder = LabelEncoder()

# Fit the encoder on data `a`, then convert `a` to integer labels

encoder.fit(a)

b= encoder.transform(a)

위의 코드에서 fit_transform 이라고 한건, fit, transform 을 모두 진행해줌

참고 자료 : https://m.blog.naver.com/PostView.naver?isHttpsRedirect=true&blogId=wideeyed&logNo=221592651246

2) feature_importances_

변수별 중요도 추출

3) np.argsort(a)

array a 를 정렬하는 인덱스의 array를 반환

참고 자료 : https://codetorial.net/tips_and_examples/numpy_argsort.html

4) print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]) + " - " + X_train.columns[indices[f]])

% 뒤의 f + 1, indices[f], importances[indices[f]] 값들이 각각

%d , %d, %f 값으로 들어감

즉,

%d = f+1

%d = indices[f]

%f = importances[indices[f]]

맨마지막의 X_train.columns[indices[f]] 이 값은 그냥 문자열로 표시됨

예를 들어서 아래와 같음

%d: 십진수

%f : 실수

# 코드 3

def do_features(df):
    """Add engineered ratio/difference features and adult household aggregates.

    Steps, in order:

    1. For each ``(name, a, b)`` ratio spec, write ``fe_<name> = a / b`` as
       float32. Zero denominators are not guarded against.
    2. For each ``(name, a, b)`` difference spec, write ``fe_<name> = a - b``
       as float32.
    3. Restrict to rows with ``age >= 18``, group them by ``idhogar`` and
       aggregate: min/max/mean of ``age`` and ``escolari``, mean of ``dis``,
       and mean/count of every column whose name starts with
       ``estadocivil``, ``parentesco`` or ``instlevel``. The aggregate
       columns are named ``agg18_<column>_<FUNC>`` and left-joined back on
       ``idhogar``.
    4. Drop the ``Id`` column from the joined frame.

    Note that the ``fe_*`` columns are written into the frame passed in,
    whereas the aggregates and the ``Id`` removal only exist on the returned
    frame, which is a new object produced by the join.

    Returns
    -------
    DataFrame
        The joined frame without ``Id``.
    """
    # (new feature name, numerator column, denominator column)
    ratio_specs = (
        ('children_fraction', 'r4t1', 'r4t3'),
        ('working_man_fraction', 'r4h2', 'r4t3'),
        ('all_man_fraction', 'r4h3', 'r4t3'),
        ('human_density', 'tamviv', 'rooms'),
        ('human_bed_density', 'tamviv', 'bedrooms'),
        ('rent_per_person', 'v2a1', 'r4t3'),
        ('rent_per_room', 'v2a1', 'rooms'),
        ('mobile_density', 'qmobilephone', 'r4t3'),
        ('tablet_density', 'v18q1', 'r4t3'),
        ('mobile_adult_density', 'qmobilephone', 'r4t2'),
        ('tablet_adult_density', 'v18q1', 'r4t2'),
    )
    # (new feature name, minuend column, subtrahend column)
    difference_specs = (
        ('people_not_living', 'tamhog', 'tamviv'),
        ('people_weird_stat', 'tamhog', 'r4t3'),
    )

    for feat_name, numerator, denominator in ratio_specs:
        ratio = df[numerator] / df[denominator]
        df['fe_' + feat_name] = ratio.astype(np.float32)
    for feat_name, minuend, subtrahend in difference_specs:
        difference = df[minuend] - df[subtrahend]
        df['fe_' + feat_name] = difference.astype(np.float32)

    # aggregation rules over household: numeric columns first, then 'dis',
    # then every one-hot column of the three prefixes (order fixes the
    # order of the resulting aggregate columns)
    agg_rules = {
        'age': ['min', 'max', 'mean'],
        'escolari': ['min', 'max', 'mean'],
        'dis': ['mean'],
    }
    for prefix in ('estadocivil', 'parentesco', 'instlevel'):
        for column in df.columns:
            if column.startswith(prefix):
                agg_rules[column] = ['mean', 'count']

    # aggregation over household, adults only
    tag = '18'
    adults = df.query('age >= 18')
    household_stats = adults.groupby('idhogar').agg(agg_rules).astype(np.float32)
    # agg() yields (column, function) tuples as column labels; flatten them
    flat_names = [
        'agg' + tag + '_' + column + "_" + func.upper()
        for column, func in household_stats.columns.tolist()
    ]
    household_stats.columns = pd.Index(flat_names)
    enriched = df.join(household_stats, how='left', on='idhogar')

    # Drop id's
    enriched.drop(['Id'], axis=1, inplace=True)

    return enriched

  feats_div = [('children_fraction', 'r4t1', 'r4t3'),
                 ('working_man_fraction', 'r4h2', 'r4t3'),
                 ('all_man_fraction', 'r4h3', 'r4t3'),
                 ('human_density', 'tamviv', 'rooms'),
                 ('human_bed_density', 'tamviv', 'bedrooms'),
                 ('rent_per_person', 'v2a1', 'r4t3'),
                 ('rent_per_room', 'v2a1', 'rooms'),
                 ('mobile_density', 'qmobilephone', 'r4t3'),
                 ('tablet_density', 'v18q1', 'r4t3'),
                 ('mobile_adult_density', 'qmobilephone', 'r4t2'),
                 ('tablet_adult_density', 'v18q1', 'r4t2'),
                ]

튜플형식의 리스트를 만듦

() : 튜플

[] : 리스트

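children_fraction : 12세 미만 아이들 비율 (r4t1 / r4t3)

working_man_fraction : 12세 이상 남성 비율 (r4h2 / r4t3)

all_man_fraction : 전체 남성 비율 (r4h3 / r4t3)

human_density : 방 1개당 거주 인원 (tamviv / rooms)

human_bed_density : 침실 1개당 거주 인원 (tamviv / bedrooms)

rent_per_person : 1인당 임대료 (v2a1 / r4t3)

rent_per_room : 방당 임대료 (v2a1 / rooms)

mobile_density : 1인당 휴대폰 수 (qmobilephone / r4t3)

tablet_density : 1인당 태블릿 수 (v18q1 / r4t3)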

------------------

feats_div = [('children_fraction', 'r4t1', 'r4t3'),

feats_div 를 이렇게 만들 때,

f_new = children_fraction , f1 =r4t1 , f2=r4t3 이렇게 해서,

df에다가 ['fe_' + f_new] 칼럼을 만들 때, 그 값을 df[f1]/df[f2] 로 넣고, 그 타입은 float32 로 한다.

즉, df[fe_children_fraction] = df[r4t1]/df[r4t3]

    aggs_num = {'age': ['min', 'max', 'mean'],
                'escolari': ['min', 'max', 'mean']
               }

파이썬 dictionary 형태

: 키-값 쌍으로 맵핑된 자료형 (Python 3.7부터는 입력한 순서가 유지됨)

-> age : 키, ['min', 'max', 'mean'] : 값

    for s_ in ['estadocivil', 'parentesco', 'instlevel']:
        for f_ in [f_ for f_ in df.columns if f_.startswith(s_)]:
            aggs_cat[f_] = ['mean', 'count']

startswith(시작 문자, 시작지점) : 문자열이 특정 문자로 시작하는지 여부 알려줌

즉, df의 column 중에 'estadocivil', 'parentesco', 'instlevel' 이 값들로 시작되는 칼럼이 있다면(예를 들어 estadocivil3)

aggs_cat 딕셔너리에 그 칼럼명을 키로 추가하고, 값으로는 그 칼럼에 적용할 집계 함수 목록 ['mean', 'count'] 를 넣는다.

query : 판다스에서 조건에 부합하는 데이터 추출 시 사용

-> 기능

1: 비교(==, > , >=, <, <=,!=)

2: in 연산자(in, == , not in, !=)

3: 논리 연산자(and, or, not)

4: 외부 변수(또는 함수) 참조 연산

5: 인덱스 검색

6: 문자열 부분검색(str.contains, str.startswith, str.endswith)

참고 자료 : https://m.blog.naver.com/wideeyed/221867273249

6) for name_, df_ in [('18', df.query('age >= 18'))]:

name_ = 18

df_ = df.query('age >= 18')

df_agg = df_.groupby('idhogar').agg({**aggs_num, **aggs_cat}).astype(np.float32)

df_.groupby('idhogar')

-> df_ 의 칼럼 idhogar 을 기준으로, idhogar 값이 같은 것끼리 묶음

agg({**aggs_num, **aggs_cat}).astype(np.float32)

-> 그렇게 나뉜 각 그룹에 대해 agg 함수 실행

파이썬에는 *args, **kwargs 가 존재

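함수 정의에서 *args 는 위치 인자를 튜플 형태로, **kwargs 는 키-값 형태로 된 인자를 딕셔너리 형태로 저장한다.

하지만 여기서처럼 딕셔너리 { } 안에서 ** 를 쓰면 "딕셔너리 언패킹"으로, 해당 딕셔너리의 키-값 쌍을 풀어서 새 딕셔너리에 넣는다.

aggs_num = {'age': ['min', 'max', 'mean'], 'escolari': ['min', 'max', 'mean'] }

이런 형태이기 때문에, {**aggs_num, **aggs_cat} 는 두 딕셔너리를 하나로 합친

{'age': ['min', 'max', 'mean'], 'escolari': ['min', 'max', 'mean'], 'dis': ['mean'], ...}

이런 딕셔너리가 된다.

agg 에 이 딕셔너리를 넘기면, 각 칼럼(키)에 대해 리스트에 있는 집계 함수(min, max, mean 등)를 그룹별로 계산하고, 결과 칼럼은 (칼럼명, 함수명) 형태의 MultiIndex 가 된다.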

참고자료:

https://legitcode267.tistory.com/13

8) df_agg.columns = pd.Index(['agg' + name_ + '_' + e[0] + "_" + e[1].upper() for e in df_agg.columns.tolist()])

agg 결과의 칼럼은 (칼럼명, 함수명) 튜플로 이루어진 MultiIndex 이므로, 각 튜플 e 를 'agg' + name_ + '_' + e[0] + "_" + e[1].upper() 형태의 문자열 하나로 합쳐서 pd.Index 로 만든 뒤, df_agg 의 칼럼 이름으로 다시 지정함

그래서 아래 코드 결과값 보면 이런 결과가 나오는 듯 함

df_agg.columns.tolist()

-> tolist() : 리스트 형태로 반환(e[0], e[1])

그러니깐, 위의 그림 첫번째 값에서 e[0]은 estadocivil1, e[1]은 mean 이고, upper() 로 대문자 MEAN 이 되어 최종 칼럼명은 agg18_estadocivil1_MEAN 이 됨

9) del df_agg

del : 변수(이름) 자체를 삭제. 여기서는 더 이상 필요 없는 df_agg 변수를 지움 (리스트에서는 del a[i] 처럼 인덱스로 요소를 삭제)

remove() : 값으로 삭제

참고자료 : https://ooyoung.tistory.com/49

# 코드 4

# convert one hot encoded fields to label encoding
def convert_OHE2LE(df):
    """Collapse groups of one-hot columns into single label-encoded columns.

    All work happens on a deep copy of ``df``; the copy is returned.

    For every group prefix the member columns are found by name prefix
    (the ``manual_elec`` group uses a fixed list of column names instead).
    If any row has no member set, a ``<prefix>_dummy`` column marking those
    rows is added to the group so every row has exactly one candidate.
    The name of the row-wise maximum column is label-encoded into
    ``<prefix>_LE`` (int16) and the member columns are dropped, except
    ``parentesco1``, which is kept.

    Returns
    -------
    DataFrame
        The transformed copy.
    """
    group_prefixes = [
        'pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar',
        'elimbasu', 'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco',
        'instlevel', 'lugar', 'tipovivi',
        'manual_elec',
    ]
    encoded = df.copy(deep=True)

    for prefix in group_prefixes:
        if 'manual_' not in prefix:
            members = [name for name in df.columns if name.startswith(prefix)]
        elif 'elec' in prefix:
            # these columns share no common prefix, so list them by hand
            members = ['public', 'planpri', 'noelec', 'coopele']

        # rows where none of the group's columns is set
        row_is_unset = encoded[members].sum(axis=1) == 0
        if row_is_unset.any():
            message = 'The OHE in {} is incomplete. A new column will be added before label encoding'
            print(message.format(prefix))
            # extra category column flagging the rows that had nothing set
            filler = prefix + '_dummy'
            encoded[filler] = row_is_unset.astype(np.int8)
            members.append(filler)
            # proof-check that the category is complete now
            if (encoded[members].sum(axis=1) == 0).any():
                print("The category completion did not work")

        winning_column = encoded[members].idxmax(axis=1)
        labels = LabelEncoder().fit_transform(winning_column)
        encoded[prefix + '_LE'] = labels.astype(np.int16)

        # parentesco1 stays in the frame; every other member column goes
        to_drop = [name for name in members if name != 'parentesco1']
        encoded.drop(to_drop, axis=1, inplace=True)

    return encoded

코드 설명

for s_ in ['pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 'instlevel', 'lugar', 'tipovivi', 'manual_elec']:

if 'manual_' not in s_: cols_s_ = [f_ for f_ in df.columns if f_.startswith(s_)]

-> 즉, s_는 'pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 'instlevel', 'lugar', 'tipovivi', 'manual_elec' 인데,

manual 이 포함되어 있지 않는 값들

: 'pared', 'piso', 'techo', 'abastagua', 'sanitario', 'energcocinar', 'elimbasu', 'epared', 'etecho', 'eviv', 'estadocivil', 'parentesco', 'instlevel', 'lugar', 'tipovivi',

이 값들의 cols_s_ = df의 칼럼중에 s_로 시작하는 칼럼들

elif 'elec' in s_:

cols_s_ = ['public', 'planpri', 'noelec', 'coopele']

s_ 값 중에 elec 라는 단어가 포함되어 있는 즉, 'manual_elec'

이 값의 cols_s_ 는, 'public', 'planpri', 'noelec', 'coopele' 이 된다.

참고자료:

https://www.kaggle.com/skooch/xgboost/notebook

XGBoost

Explore and run machine learning code with Kaggle Notebooks | Using data from Costa Rican Household Poverty Level Prediction

www.kaggle.com

'머신러닝 > 캐글' 카테고리의 다른 글

구글 코랩 사용법 (0)	2021.11.03
중고차 가격 예측 모델 (0)	2021.11.01
[필사 실패] Home Credit Default Risk (0)	2021.10.31
[필사] Porto Seguro Exploratory Analysis and Prediction - 아직 완성 못함 (0)	2021.10.31
Porto Seguro’s Safe Driver Prediction (0)	2021.10.30

ABOUT ME

good day good day

'머신러닝 > 캐글' 카테고리의 다른 글

티스토리툴바

ABOUT ME

'머신러닝 > 캐글' 카테고리의 다른 글

관련글 관련글 더보기

티스토리툴바