Thief of Wealth
Published 2019. 8. 16. 16:03
머신러닝 template 개발/ML+ Data Science

맨날 다른 EDA를 하지만 공통적으로 먹히는 템플릿이 있긴 있는 것 같다.


https://www.kaggle.com/ashishpatel26/predict-sales-price-using-xgboost 여기서 가지고 왔다.



# 각종 정보 보기

# Print a quick exploratory-data-analysis summary of a DataFrame.
def eda(data):
    """Print head, info, dtypes, missing-value counts and shape of *data*.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to summarise. Everything is printed; nothing is returned.
    """
    print("----------Top-5- Record----------")
    print(data.head(5))
    print("-----------Information-----------")
    # BUG FIX: DataFrame.info() prints to stdout itself and returns None,
    # so the original print(data.info()) emitted a stray "None" line.
    data.info()
    print("-----------Data Types-----------")
    print(data.dtypes)
    print("----------Missing value-----------")
    print(data.isnull().sum())
    print("----------Null value-----------")
    # NOTE: isna() is an alias of isnull(), so this repeats the count above;
    # kept to preserve the original report layout.
    print(data.isna().sum())
    print("----------Shape of Data----------")
    print(data.shape)

# View numeric data as histograms.
def graph_insight(data):
    """Print the distinct dtypes of *data*, then histogram every numeric column."""
    print(set(data.dtypes.tolist()))
    numeric_cols = data.select_dtypes(include=['float64', 'int64'])
    numeric_cols.hist(figsize=(16, 16), bins=50, xlabelsize=8, ylabelsize=8)

# 중복제거

# Drop duplicated rows in place and report how many were removed.
def drop_duplicate(data, subset):
    """Remove, in place, the rows of *data* duplicated over the *subset* columns.

    subset is the list of all columns to use for the duplicate check; the
    first occurrence of each duplicate group is kept and the index is reset.
    """
    print('Before drop shape:', data.shape)
    rows_before = data.shape[0]
    data.drop_duplicates(subset, keep='first', inplace=True)
    data.reset_index(drop=True, inplace=True)
    print('After drop shape:', data.shape)
    rows_after = data.shape[0]
    print('Total Duplicate:', rows_before - rows_after)


# Outlier 제거 (데이터 손실 주의!!)

# feature에 target값같은거 잘 제외하고 for문돌리기

# Remove IQR-based outliers (beware: rows are dropped from the data!).
def remove_outlier(data, feature):
    """Drop, in place, the rows of *data* whose *feature* value lies outside
    the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Meant for float columns; mutates *data* directly, so no return value is
    needed. Exclude target-like columns when looping features over this.
    """
    print(F"Feature : {feature}")
    q25, q75 = np.percentile(data[feature].values, 25), np.percentile(data[feature].values, 75)
    print(F"Quartile 25: {q25}, Quatile 75: {q75}")
    iqr = q75 - q25
    # BUG FIX: the original used a plain string 'IQR: {iqr}' (missing the
    # f-prefix) and printed the literal placeholder instead of the value.
    print(F"IQR: {iqr}")
    cutoff = iqr * 1.5
    lower, upper = q25 - cutoff, q75 + cutoff
    print('Cut off: {}'.format(cutoff))
    print(F"Lower : {lower}, Upper: {upper}")
    outliers = [x for x in data[feature].values if x < lower or x > upper]
    print(F"There are {len(outliers)} outliers")
    data.drop(data[(data[feature] > upper) | (data[feature] < lower)].index, inplace=True)
    print('\n')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

# Train, test셋이 같은 data에 있어야함. 왜냐하면 따로하면 따로 fit되기 때문.

# Feature scaling helper.
def scaling(data, feature, kind_of_scaler):
    """Fit-transform data[feature] in place with the requested sklearn scaler.

    kind_of_scaler is one of 'Standard', 'MinMax', 'MaxAbs', 'Robust'.
    If the train and test sets are split, merge them into one frame first —
    fitting a scaler separately on each produces inconsistent transforms.

    Raises
    ------
    ValueError
        For 'Log' (not implemented yet) or any unknown scaler name.
    """
    scaler_kinds = {
        'Standard': StandardScaler,
        'MinMax': MinMaxScaler,
        'MaxAbs': MaxAbsScaler,
        'Robust': RobustScaler,
    }
    # BUG FIX: the original left scaler = None for 'Log' (and silently fell
    # through for unknown names), then crashed with an opaque AttributeError
    # on scaler.fit_transform. Fail fast with a clear message instead.
    if kind_of_scaler not in scaler_kinds:
        raise ValueError(F"Unsupported scaler kind: {kind_of_scaler!r}")
    scaler = scaler_kinds[kind_of_scaler]()
    data[feature] = scaler.fit_transform(data[feature].values.reshape(-1, 1))

# score printing

# the function that we will use to better evaluate the model

from sklearn.metrics import precision_score, recall_score, fbeta_score, confusion_matrix, precision_recall_curve, accuracy_score

def print_results(headline, true_value, pred):
    """Print *headline* followed by accuracy, precision, recall and F2 score
    of *pred* against *true_value*."""
    print(headline)
    scores = (
        ("accuracy", accuracy_score(true_value, pred)),
        ("precision", precision_score(true_value, pred)),
        ("recall", recall_score(true_value, pred)),
        ("f2", fbeta_score(true_value, pred, beta=2)),
    )
    for label, value in scores:
        print(F"{label}: {value}")


profile on loading

Loading...