Every EDA is a little different, but there does seem to be a template that works across the board.
Taken from https://www.kaggle.com/ashishpatel26/predict-sales-price-using-xgboost.
# Inspect general information
def eda(data):
    print("----------Top-5 Records----------")
    print(data.head(5))
    print("-----------Information-----------")
    data.info()  # info() prints directly and returns None, so no print() wrapper
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing values----------")
    print(data.isnull().sum())
    print("----------Null values----------")
    print(data.isna().sum())  # isna() is an alias of isnull(), so this repeats the line above
    print("----------Shape of Data----------")
    print(data.shape)
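A quick usage sketch (the file name train.csv is my assumption; substitute your own dataset):

import pandas as pd

df = pd.read_csv('train.csv')  # hypothetical file name
eda(df)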
# View numeric columns as histograms
def graph_insight(data):
    print(set(data.dtypes.tolist()))
    df_num = data.select_dtypes(include=['float64', 'int64'])
    df_num.hist(figsize=(16, 16), bins=50, xlabelsize=8, ylabelsize=8)
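DataFrame.hist draws with matplotlib under the hood, so outside of a notebook you need an explicit show() call:

import matplotlib.pyplot as plt

graph_insight(df)  # df from the sketch above
plt.show()  # required in a plain script; notebooks render the figure automatically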
# Drop duplicate rows
def drop_duplicate(data, subset):
    # subset is the list of columns to check for duplicates
    print('Before drop shape:', data.shape)
    before = data.shape[0]
    data.drop_duplicates(subset, keep='first', inplace=True)
    data.reset_index(drop=True, inplace=True)
    print('After drop shape:', data.shape)
    after = data.shape[0]
    print('Total Duplicates:', before - after)
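For example, on a toy DataFrame with one duplicated row:

import pandas as pd

df_toy = pd.DataFrame({'id': [1, 1, 2], 'price': [10, 10, 20]})
drop_duplicate(df_toy, subset=['id', 'price'])
# Before drop shape: (3, 2) / After drop shape: (2, 2) / Total Duplicates: 1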
# Remove outliers (beware of data loss!!)
# When looping over features, make sure to exclude columns like the target
import numpy as np

def remove_outlier(data, feature):
    # Use on float columns; modifies data in place, so no return value is needed
    print(F"Feature: {feature}")
    q25, q75 = np.percentile(data[feature].values, 25), np.percentile(data[feature].values, 75)
    print(F"Quartile 25: {q25}, Quartile 75: {q75}")
    iqr = q75 - q25
    print(F"IQR: {iqr}")
    cutoff = iqr * 1.5
    lower, upper = q25 - cutoff, q75 + cutoff
    print(F"Cut off: {cutoff}")
    print(F"Lower: {lower}, Upper: {upper}")
    outliers = [x for x in data[feature].values if x < lower or x > upper]
    print(F"There are {len(outliers)} outliers")
    data.drop(data[(data[feature] > upper) | (data[feature] < lower)].index, inplace=True)
    print('\n')
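Per the note above, the driving loop might look like this (the column name 'target' is an assumption; use your actual target):

num_cols = df.select_dtypes(include=['float64', 'int64']).columns
for feature in num_cols:
    if feature == 'target':  # hypothetical target column; skip it
        continue
    remove_outlier(df, feature)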
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
# Train and test sets must be in the same DataFrame; if you scale them separately, the scaler gets fit separately to each.
# Scaling helper
def scaling(data, feature, kind_of_scaler):
    scaler = None
    if kind_of_scaler == 'Standard':
        scaler = StandardScaler()
    elif kind_of_scaler == 'MinMax':
        scaler = MinMaxScaler()
    elif kind_of_scaler == 'MaxAbs':
        scaler = MaxAbsScaler()
    elif kind_of_scaler == 'Robust':
        scaler = RobustScaler()
    elif kind_of_scaler == 'Log':
        scaler = None  # not decided yet
    # If the train and test sets are split, combine them first or handle this anew
    data[feature] = scaler.fit_transform(data[feature].values.reshape(-1, 1))
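The 'Log' branch is left undefined above; as a hedged sketch (my assumption, not from the source), np.log1p is one common stand-in, since log scaling is not a scikit-learn scaler object:

import numpy as np

def log_scale(data, feature):
    # log1p = log(1 + x): safe at zero, but fails for values <= -1
    data[feature] = np.log1p(data[feature].values)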
# For printing scores: the function we will use to better evaluate the model
from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix, precision_recall_curve, accuracy_score

def print_results(headline, true_value, pred):
    print(headline)
    print(F"accuracy: {accuracy_score(true_value, pred)}")
    print(F"precision: {precision_score(true_value, pred)}")
    print(F"recall: {recall_score(true_value, pred)}")
    print(F"f2: {fbeta_score(true_value, pred, beta=2)}")
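A toy call with made-up binary labels:

y_true = [0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 0, 1]
print_results("Toy example", y_true, y_pred)
# accuracy: 0.8, precision: 1.0, recall: 0.667, f2: about 0.714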