{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### 1. Dealing with missing data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pandas import Series, DataFrame\n", "import pandas as pd\n", "from numpy.random import randn\n", "import numpy as np\n", "from collections import Counter\n", "import time\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(478138, 27) (61259, 26)\n", "['instance_id', 'item_id', 'item_category_list', 'item_property_list', 'item_brand_id', 'item_city_id', 'item_price_level', 'item_sales_level', 'item_collected_level', 'item_pv_level', 'user_id', 'user_gender_id', 'user_age_level', 'user_occupation_id', 'user_star_level', 'context_id', 'context_timestamp', 'context_page_id', 'predict_category_property', 'shop_id', 'shop_review_num_level', 'shop_review_positive_rate', 'shop_star_level', 'shop_score_service', 'shop_score_delivery', 'shop_score_description', 'is_trade']\n" ] } ], "source": [ "train = pd.read_table('round1_train.txt',sep = ' ')\n", "test_a = pd.read_table('round1_testa.txt',sep = ' ')\n", "test_b = pd.read_table('round1_testb.txt',sep = ' ')\n", "test = pd.concat([test_a,test_b]) # combine a,b test set\n", "print(train.shape,test.shape)\n", "print(list(train.columns))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# drop duplicate instances in train and test\n", "train.drop_duplicates(inplace = True)\n", "test.drop_duplicates(inplace = True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "positive instances num: 8994\n", "negative instances num: 469117\n" ] } ], "source": [ "print('positive instances num: ',sum(train['is_trade']))\n", "print('negative instances num: ', train.shape[0] - sum(train['is_trade']))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "item_brand_id 473\n", "item_city_id 277\n", "item_sales_level 913\n", "user_gender_id 12902\n", "user_age_level 964\n", "user_occupation_id 964\n", "user_star_level 964\n", "shop_review_positive_rate 7\n", "shop_score_service 59\n", "shop_score_delivery 59\n", "shop_score_description 59\n" ] } ], "source": [ "# how many missing data in each feature?\n", "\n", "# item_brand_id: 473 \n", "# item_city_id: 277 \n", "# item_sales_level: 913 \n", "# user_gender_id: 12902 \n", "# user_age_level: 964 \n", "# user_star_level: 964 \n", "# user_occupation_id: 964 \n", "# shop_review_positive_rate: 7 \n", "# shop_score_service 59\n", "# shop_score_delivery 59\n", "# shop_score_description 59\n", "\n", "for feature in list(train):\n", " missing_num = (train[feature] == -1).sum()\n", " if missing_num > 0:\n", " print(feature, missing_num)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# fill id with the most common one or two\n", "def filling_id(df,feature,missing_num):\n", " cc = Counter(df[feature])\n", " common = cc.most_common(2)\n", " # intutionly choose 5\n", " if common[0][1]/common[1][1] < 5: \n", " total = common[0][1] + common[1][1]\n", " p = [common[0][1]/total, common[1][1]/total]\n", " replace = np.random.choice([common[0][0], common[0][1]],missing_num,p)\n", " df.loc[df[feature] == -1, feature] = replace\n", " else:\n", " df.loc[df[feature] == -1, feature] = common[0][0]\n", " return df\n", "\n", "train= filling_id(train, 'item_brand_id', 473)\n", "train = filling_id(train,'item_city_id', 277)\n", "train = filling_id(train,'user_gender_id',12902)\n", "train = filling_id(train,'user_occupation_id',964)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# filling level with the most common six values\n", "def filling_level(df,feature,missing_num):\n", " cc = Counter(df[feature])\n", " # intutionly choose 6\n", " common = cc.most_common(6)\n", " total = 0\n", " p = []\n", " replace_list = []\n", " for i in common:\n", " total += i[1]\n", " for i in common:\n", " p.append(i[1]/total)\n", " replace_list.append(i[0])\n", " replace = np.random.choice(replace_list,missing_num,p)\n", " df.loc[df[feature] == -1, feature] = replace\n", " return df\n", "\n", "train = filling_level(train,'item_sales_level', 913)\n", "train = filling_level(train,'user_age_level', 964)\n", "train = filling_level(train, 'user_star_level', 964)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# replace missing data in these three columns with average\n", "items = ['shop_score_service', 'shop_score_delivery', 'shop_score_description']\n", "shop_service_mean = train.loc[train[items[0]] != -1, items[0]].mean()\n", "train.loc[train[items[0]] == -1, items[0]] = shop_service_mean\n", "shop_delivery_mean = train.loc[train[items[1]] != -1, items[1]].mean()\n", "train.loc[train[items[1]] == -1, items[1]] = shop_delivery_mean\n", "shop_description_mean = train.loc[train[items[2]] != -1, items[2]].mean()\n", "train.loc[train[items[2]] == -1, items[2]] = shop_description_mean" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(1.0, 265918), (0.9970402457701614, 1852)]\n" ] } ], "source": [ "# filling shop_review_positive_rate\n", "cc = Counter(train['shop_review_positive_rate'])\n", "print(cc.most_common(2))\n", "train.loc[train['shop_review_positive_rate'] == -1, 'shop_review_positive_rate'] = 1" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | instance_id | \n", "item_id | \n", "item_category_list | \n", "item_property_list | \n", "item_brand_id | \n", "item_city_id | \n", "item_price_level | \n", "item_sales_level | \n", "item_collected_level | \n", "item_pv_level | \n", "... | \n", "context_page_id | \n", "predict_category_property | \n", "shop_id | \n", "shop_review_num_level | \n", "shop_review_positive_rate | \n", "shop_star_level | \n", "shop_score_service | \n", "shop_score_delivery | \n", "shop_score_description | \n", "is_trade | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "108641074714126964 | \n", "3412720377098676069 | \n", "7908382889764677758;5799347067982556520 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "... | \n", "4006 | \n", "5799347067982556520:-1;509660095530134768:-1;5... | \n", "6765930309048922341 | \n", "4 | \n", "1.0 | \n", "5002 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0 | \n", "
1 | \n", "5754713551599725161 | \n", "3412720377098676069 | \n", "7908382889764677758;5799347067982556520 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "... | \n", "4001 | \n", "5799347067982556520:9172976955054793469;790838... | \n", "6765930309048922341 | \n", "4 | \n", "1.0 | \n", "5002 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0 | \n", "
2 | \n", "842679481291040981 | \n", "3412720377098676069 | \n", "7908382889764677758;5799347067982556520 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "... | \n", "4001 | \n", "5799347067982556520:5131280576272319091;725801... | \n", "6765930309048922341 | \n", "4 | \n", "1.0 | \n", "5002 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0 | \n", "
3 | \n", "937088850059189027 | \n", "3412720377098676069 | \n", "7908382889764677758;5799347067982556520 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "... | \n", "4016 | \n", "509660095530134768:-1;5799347067982556520:-1;7... | \n", "6765930309048922341 | \n", "4 | \n", "1.0 | \n", "5002 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0 | \n", "
4 | \n", "7975697065017708072 | \n", "3412720377098676069 | \n", "7908382889764677758;5799347067982556520 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "... | \n", "4001 | \n", "5799347067982556520:9172976955054793469;790838... | \n", "6765930309048922341 | \n", "4 | \n", "1.0 | \n", "5002 | \n", "1.0 | \n", "1.0 | \n", "1.0 | \n", "0 | \n", "
5 rows × 27 columns
\n", "\n", " | instance_id | \n", "item_id | \n", "item_property_list | \n", "item_brand_id | \n", "item_city_id | \n", "item_price_level | \n", "item_sales_level | \n", "item_collected_level | \n", "item_pv_level | \n", "user_id | \n", "... | \n", "time | \n", "is_midnight | \n", "is_morning | \n", "is_afternoon | \n", "is_night | \n", "item_category | \n", "cate_precision | \n", "cate_recall | \n", "prop_precision | \n", "prop_recall | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "108641074714126964 | \n", "3412720377098676069 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "4505772604969228686 | \n", "... | \n", "22:09:04 | \n", "0 | \n", "0 | \n", "0 | \n", "1 | \n", "5799347067982556520 | \n", "0.400000 | \n", "1.0 | \n", "0.0 | \n", "0.000000 | \n", "
1 | \n", "5754713551599725161 | \n", "3412720377098676069 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "2692638157208937547 | \n", "... | \n", "00:00:32 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "5799347067982556520 | \n", "1.000000 | \n", "1.0 | \n", "0.2 | \n", "0.045455 | \n", "
2 | \n", "842679481291040981 | \n", "3412720377098676069 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "5247924392014515924 | \n", "... | \n", "15:04:12 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "5799347067982556520 | \n", "0.666667 | \n", "1.0 | \n", "1.0 | \n", "0.045455 | \n", "
3 | \n", "937088850059189027 | \n", "3412720377098676069 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "2681414445369714628 | \n", "... | \n", "18:17:50 | \n", "0 | \n", "0 | \n", "1 | \n", "0 | \n", "5799347067982556520 | \n", "0.400000 | \n", "1.0 | \n", "0.0 | \n", "0.000000 | \n", "
4 | \n", "7975697065017708072 | \n", "3412720377098676069 | \n", "2072967855524022579;5131280576272319091;263639... | \n", "1975590437749032870 | \n", "3948283326616421003 | \n", "3 | \n", "3 | \n", "4 | \n", "14 | \n", "2729475788342039013 | \n", "... | \n", "07:48:40 | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "5799347067982556520 | \n", "1.000000 | \n", "1.0 | \n", "1.0 | \n", "0.045455 | \n", "
5 rows × 38 columns
\n", "