初赛 赛题 大赛初赛数据共包含两个文件,训练文件d_train.csv和测试文件d_test.csv,每个文件第一行是字段名,之后每一行代表一个个体。文件共包含42个字段,包含数值型、字符型、日期型等众多数据类型,部分字段内容在部分人群中有缺失,其中第一列为个体ID号。训练文件的最后一列为标签列,即需要预测的目标血糖值。
数据
分析 对于缺失严重的数据(乙肝抗原抗体相关数据,乙肝病毒与糖尿病的相关性不大)直接删掉,缺少性别的用男性补全,删掉年龄小于16以及血糖大于18的异常数据1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 def make_feat (train, test) : train_id = train.id.values.copy() test_id = test.id.values.copy() train = train[train['old' ]>=16 ] train = train[train['xuetang' ]<=18 ] data = pd.concat([train, test]) data['male' ] = data['male' ].map({'男' : 1 , '女' : 0 ,'??' :1 }) data.drop(['yigan_biaomian_kangyuan' , 'yigan_biaomian_kangti' , 'yigan_e_kangyuan' , 'yigan_e_kangti' , 'yigan_hexin_kangti' ,'date' ], axis=1 , inplace=True ) train_feat = data[data.id.isin(train_id)] test_feat = data[data.id.isin(test_id)] del train_feat['id' ] del test_feat['id' ] return train_feat, test_feat
1 2 3 4 def evalerror (pred, df) : label = df.get_label().values.copy() score = mean_squared_error(label, pred) * 0.5 return ('mse' , score, False )
1 2 3 4 5 6 7 8 def xiaoshu (array) : new_list = [] for i in array: j = sum(i)/5 new_list.append(round(j, 3 )) return np.array(new_list) def to_csv (DF,file_name) : pd.DataFrame(DF).to_csv(file_name, index = False , header = None )
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 train = pd.read_csv('d_train_20180102_b.csv' ) test = pd.read_csv('d_test_B_20180128.csv' ) train_feat, test_feat = make_feat(train, test) predictors = [f for f in test_feat.columns if f not in ['xuetang' ]] params = {'learning_rate' : 0.01 ,'boosting_type' : 'gbdt' ,'objective' : 'regression' ,'metric' : 'mse' ,'sub_feature' : 0.9 ,'num_leaves' : 16 ,'colsample_bytree' : 0.9 ,'verbose' : -1 } cores = [] train_preds = np.zeros(train_feat.shape[0 ]) test_preds = np.zeros((test_feat.shape[0 ], 5 )) kf = KFold(len(train_feat), n_folds=5 , shuffle=True , random_state=520 ) for i, (train_index, test_index) in enumerate(kf): train_feat1 = train_feat.iloc[train_index] train_feat2 = train_feat.iloc[test_index] lgb_train1 = lgb.Dataset(train_feat1[predictors], train_feat1['xuetang' ], categorical_feature=['male' ]) lgb_train2 = lgb.Dataset(train_feat2[predictors], train_feat2['xuetang' ]) gbm = lgb.train(params,lgb_train1,num_boost_round=3000 ,valid_sets=lgb_train2,verbose_eval=100 ,feval=evalerror,early_stopping_rounds=150 ) feat_imp = pd.Series(gbm.feature_importance(), index=predictors).sort_values(ascending=False ) train_preds[test_index] += gbm.predict(train_feat2[predictors]) test_preds[:, i] = gbm.predict(test_feat[predictors]) to_csv(xiaoshu(test_preds), 'result_B.csv' )