# 2. Handling missing values
#   2.1 Drop missing values (by row or by column)
#   2.2 Fill missing values (with the column mean)
#   2.3 Flag the rows or columns that contain missing values

# Small sample frame for the steps above; None marks a missing value.
df_miss = spark.createDataFrame(
    [
        (1, 143.5, 5.6, 28, 'M', 100000),
        (2, 167.2, 5.4, 45, 'M', None),
        (3, None, 5.2, None, None, None),
        (4, 144.5, 5.9, 33, 'M', None),
        (5, 133.2, 5.7, 54, 'F', None),
        (6, 124.1, 5.2, None, 'F', None),
        (7, 129.2, 5.3, 42, 'M', 76000),
    ],
    ['id', 'weight', 'height', 'age', 'gender', 'income'],
)
# Count the missing values in each row: pair every row's id with the number
# of its fields that are None (identity check, not `== None`).
df_miss.rdd.map(
    lambda row: (row['id'], sum(value is None for value in row))
).collect()
# Output recorded with the original notes:
# [(1, 0), (2, 1), (3, 4), (4, 1), (5, 1), (6, 2), (7, 0)]
# Share of missing values per column: count(col) skips nulls while count('*')
# counts every row, so 1 - count(col) / count('*') is the null fraction.
# Each result column is named '<column>_missing'.
missing_ratio_exprs = [
    (1 - fn.count(name) / fn.count('*')).alias(name + '_missing')
    for name in df_miss.columns
]
df_miss.agg(*missing_ratio_exprs).show()
# Work without the `income` column (it is missing in 5 of the 7 sample rows).
kept_columns = [name for name in df_miss.columns if name != 'income']
df_miss_no_income = df_miss.select(kept_columns)

# thresh=3 keeps only rows that have at least three non-null values.
df_miss_no_income.dropna(thresh=3).show()
# Impute the remaining gaps: every non-gender column is filled with its column
# mean, and `gender` (a string column) is filled with the label 'missing'.
# NOTE: `id` is averaged as well; it has no nulls in the sample data, so that
# entry never changes a value.
mean_exprs = [
    fn.mean(name).alias(name)
    for name in df_miss_no_income.columns
    if name != 'gender'
]
# The aggregate produces exactly one row, so read it straight into a dict
# rather than converting the result to a pandas DataFrame first.
means = df_miss_no_income.agg(*mean_exprs).first().asDict()
means['gender'] = 'missing'

# fillna with a dict maps column name -> replacement value.
df_miss_no_income.fillna(means).show()