Thief of Wealth


def print_train_test_diff(trainset: "pd.DataFrame", testset: "pd.DataFrame") -> "pd.DataFrame":
    """Compare per-column unique-value counts between a train and a test frame.

    For every column of ``trainset`` that also exists in ``testset`` one
    summary record is built with:

    * ``column`` - the column name
    * ``train``  - ``nunique()`` of the column in ``trainset``
    * ``test``   - ``nunique()`` of the column in ``testset``
    * ``type``   - dtype of the column in ``trainset``
    * ``delta``  - ``train - test``
    * ``flag``   - 1 when the test set has more unique values than train

    The names of ``object``-typed columns whose flag is 1 are printed.

    Args:
        trainset: Training DataFrame; its columns drive the comparison.
        testset: Test DataFrame. Columns missing here (e.g. the target
            column) are skipped.

    Returns:
        The summary transposed, i.e. one column per compared feature and the
        rows ``column``, ``train``, ``test``, ``type``, ``delta``, ``flag``.
    """
    rows = []
    for column in trainset.columns:
        # The test set normally has no target column; skip anything that is
        # only present in train instead of swallowing every exception.
        if column not in testset.columns:
            continue
        rows.append(
            {
                'column': column,
                'train': trainset[column].nunique(),
                'test': testset[column].nunique(),
                'type': trainset[column].dtype,
            }
        )

    # Build the frame in one go: DataFrame.append was removed in pandas 2.0
    # and appending inside a loop copies the frame on every iteration.
    # Every row keeps the label 0, exactly as the per-row frames of the
    # previous implementation did, so the labels of the returned (transposed)
    # frame are unchanged for existing callers.
    summary = pd.DataFrame(
        rows,
        columns=['column', 'train', 'test', 'type'],
        index=[0] * len(rows),
    )

    # Difference in unique-value counts.
    summary['delta'] = summary['train'] - summary['test']
    # flag is 1 when test has more unique values than train.
    summary['flag'] = (summary['delta'] < 0).astype(int)

    test_dom_categories = summary.loc[
        (summary['flag'] == 1) & (summary['type'] == 'object'), 'column'
    ]
    print(F"Columns of type 'object' and with more categories in test than in train: {list(test_dom_categories)} ")

    summary = summary.transpose()

    print('Unique column values in identity datasets')
    return summary





사용: print_train_test_diff(train_identity_df, test_identity_df)

profile on loading

Loading...