-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmain.py
More file actions
119 lines (101 loc) · 4.07 KB
/
main.py
File metadata and controls
119 lines (101 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# -*- coding:utf-8 -*-
__author__ = 'xujia'
import pandas as pd
import numpy as np
import binning
import evaluate
import modeling
import woe
import feature_index
import feature_selection
import math
from pandas import Interval
from numpy import inf
from pprint import pprint
def file_info(file_path):
    """
    Load a CSV file and summarise each of its columns.

    :param file_path: path of the CSV file, passed straight to ``pandas.read_csv``
    :return: tuple ``(info_dict, raw_data)``; ``info_dict`` maps every column
        name to ``[dtype, number of rows, number of null values]`` and
        ``raw_data`` is the loaded DataFrame
    """
    raw_data = pd.read_csv(file_path)
    # Count missing values directly rather than deriving them from the
    # non-null count.
    info_dict = {
        c: [raw_data[c].dtype, raw_data[c].size, raw_data[c].isnull().sum()]
        for c in raw_data.columns
    }
    return info_dict, raw_data
def change_type(df, fea_type_dict, *, input_func=None):
    """
    Interactively convert one column of ``df`` to another data type.

    The current column types and a number -> column-name menu are printed,
    then the user is asked whether a column should be converted. On ``'y'``
    the user picks a column number and a target type number; the column is
    converted in place with ``astype``. Any other answer leaves ``df``
    untouched.

    :param df: DataFrame whose column may be converted (modified in place)
    :param fea_type_dict: mapping of column name to a sequence whose first
        item is the column's current type (only used for display)
    :param input_func: callable used to read an answer for a prompt;
        defaults to the builtin ``input``
    :return: None
    """
    if input_func is None:
        input_func = input
    type_dict = {1: 'float64', 2: 'int64', 3: 'str'}
    feature_dict = dict(enumerate(df.columns.values))

    def _ask_choice(prompt, retry_prompt, valid):
        """Prompt until the reply parses as an integer that is a key of ``valid``."""
        reply = input_func(prompt)
        while True:
            try:
                choice = int(reply)
            except ValueError:
                # Non-numeric input is treated like any other invalid choice.
                choice = None
            if choice in valid:
                return choice
            reply = input_func(retry_prompt)

    print('当前数据类型为:')
    for k, v in fea_type_dict.items():
        # str() so that non-string column labels can be padded as well.
        print(str(k).rjust(15), v[0])
    print('字段名称对应数字为:')
    for n, m in feature_dict.items():
        print(n, m)

    if input_func('是否需要修改字段类型?(y/n)') != 'y':
        return

    # Validate against the local menu; keep asking instead of failing with a
    # KeyError after a second bad answer.
    fea_index = _ask_choice(
        '请输入需要更改数据类型的字段对应的数字:',
        '输入字段名称错误,请重新输入:',
        feature_dict,
    )
    fea_name = feature_dict[fea_index]

    type_prompt = '请输入目标类型对应的数字(1: 浮点型(float64),2: 整型(int64),3: 字符型(str):'
    target_type = type_dict[_ask_choice(type_prompt, type_prompt, type_dict)]

    df[fea_name] = df[fea_name].astype(target_type)
def split_data(data_to_split, ratio):
    """
    Randomly split a DataFrame into two parts by row.

    The rows are shuffled and the first ``int(row_count * ratio)`` rows form
    the first part; the remaining rows form the second part.

    :param data_to_split: DataFrame to split
    :param ratio: fraction of rows that goes into the first part
    :return: list ``[first_part, second_part]`` of DataFrames, or the string
        ``'Data is too less'`` when the first part would contain no rows
    """
    data_count = data_to_split.shape[0]
    selected_count = int(data_count * ratio)
    if selected_count <= 0:
        # Kept as a string return for backward compatibility with callers.
        return 'Data is too less'
    # Shuffle the argument itself (not a module-level variable) and slice by
    # position; this avoids routing a DataFrame through np.split.
    shuffled = data_to_split.sample(frac=1)
    return [shuffled.iloc[:selected_count], shuffled.iloc[selected_count:]]
if __name__ == '__main__':
    # path=input('Please input the file path: ')
    path = 'iris.csv'

    # Load the CSV and show the per-column summary.
    fea_dict, data = file_info(path)
    print('字段名', '数据类型', '数据总量', '缺失值个数')
    pprint(fea_dict)

    # Replace missing values, then optionally let the user convert a column.
    data = data.fillna(0.0)
    change_type(data, fea_dict)
    print(data.dtypes)

    # Chi-merge binning followed by a WoE column for every column except the
    # last one. The column list is taken before any WoE column is added.
    binner = binning.Bin(data, 'Label', 5)
    for column in data.columns.values[:-1]:
        column_bins = binner.chi_merge(column)
        woe.add_woe_col(data, column_bins)

    # Single-variable AR value calculation
    # ar = ARUtil.cal_ar(data['SepalWidth_woe'], data['Label'])

    # Train on 70% of the rows and evaluate on the rest.
    train_data, test_data = split_data(data, 0.7)
    woe_features = ['SepalLength_woe', 'PetalLength_woe', 'PetalWidth_woe']
    features_and_label = woe_features + ['Label']

    model = modeling.model(train_data, woe_features, 'Label')
    # NOTE(review): 300 and 25 are passed through to score_trans unchanged;
    # their meaning is defined in the modeling module - confirm there.
    predict_score = modeling.score_trans(test_data[woe_features], model, 300, 25)
    pprint(list(zip(test_data['Label'].values, predict_score)))

    evaluation_frame = test_data[features_and_label]
    auc = evaluate.auc(model, evaluation_frame)
    print("auc值: " + str(auc))
    evaluate.roc(model, test_data[features_and_label])

    # select_func = feature_selection.fea_select(data[['SepalLength', 'SepalWidth']], data['Label'], 1)
    # print(select_func.transform(data[['SepalLength', 'SepalWidth']]))
    # feature_selection.fea_select(data[['SepalLength_woe', 'SepalWidth_woe']], data['Label'])
    # feature_selection.mi(data['SepalWidth_woe'], data['Label'])