Intro

数据集来自Kaggle的 https://www.kaggle.com/competitions/playground-series-s4e1/data （Binary Classification with a Bank Churn Dataset，与下方代码读取的路径一致）

旨在练习DNN的书写

GPT辅助生成注释

code

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset


# NOTE(review): the bare path below was a stray paste artifact (a syntax
# error as code); it is kept here only as a comment for reference.
# /kaggle/input/playground-series-s4e1/train.csv

# Load the Kaggle bank-churn training data and drop the identifier
# columns, which carry no predictive signal for the model.
data = pd.read_csv('/kaggle/input/playground-series-s4e1/train.csv')
data = data.drop(columns=["id", "CustomerId", "Surname"])


from sklearn.model_selection import train_test_split

# Label-encode the two categorical columns — presumably Geography
# (index 1) and Gender (index 2) after the id columns were dropped.
# NOTE(review): the original ran LabelEncoder twice on each column; the
# second pass is a no-op on already-encoded 0..k-1 integer codes, so a
# single pass is kept here.
data.iloc[:, 1] = LabelEncoder().fit_transform(data.iloc[:, 1])
data.iloc[:, 2] = LabelEncoder().fit_transform(data.iloc[:, 2])

# Coerce every column to numeric; unparseable values become NaN.
for col in data.columns:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# Replace missing values with 0 (alternative: data.dropna()).
data = data.fillna(0)

# Split features (all but the last column) and target (last column, the
# churn label) into 70/30 train/test tensors.
# NOTE(review): no random_state is set, so the split is not reproducible
# across runs — kept as-is to preserve the original behavior.
x_train, x_test, y_train, y_test = train_test_split(
    torch.tensor(data.iloc[:, :-1].values, dtype=torch.float32),
    torch.tensor(data.iloc[:, -1].values, dtype=torch.float32),
    test_size=0.3,
)


# 创建 Tensor 数据集和数据加载器
# Wrap the training tensors in a DataLoader that yields shuffled
# mini-batches of 32 samples each epoch.
batch_size = 32
dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


class DNN(nn.Module):
    """Feed-forward binary classifier for the 10-feature churn data.

    Architecture: 10 -> 32 -> 16 -> 8 -> 4 -> 1 with ReLU activations,
    dropout (p=0.5) after the first two hidden layers, and a final
    Sigmoid so the output is a probability in [0, 1] — intended to be
    paired with ``nn.BCELoss``.

    NOTE(review): the pasted original had all indentation stripped;
    this is a faithful reconstruction of the same layer stack.
    """

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(10, 32), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(32, 16), nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(16, 8), nn.ReLU(),
            nn.Linear(8, 4), nn.ReLU(),
            nn.Linear(4, 1),   # no ReLU on the output layer
            nn.Sigmoid(),      # squash to a probability
        )

    def forward(self, x):
        """Return per-sample probabilities, shape (batch, 1)."""
        return self.net(x)


# 初始化模型、损失函数和优化器
# Binary-classification setup: the network's sigmoid output is paired
# with binary cross-entropy, optimized with plain SGD.
model = DNN()
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)


# 训练
# Training configuration.
epochs = 20
losses = []  # mean loss per epoch, kept for plotting later

# NOTE(review): the original pasted the same training loop twice in a
# row (effectively 40 epochs of continued training) and contained a
# stray "add Codeadd Markdown" artifact line. A single clean loop is
# kept. The first pasted copy also compared Pred_binary (batch, 1)
# against batch_y (batch,) without flattening, which broadcasts to a
# (batch, batch) matrix and inflates the accuracy count — the flattened
# comparison from the second copy is the correct one and is used here.
for epoch in range(epochs):
    print('now is epoch:', epoch)
    epoch_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()            # clear accumulated gradients
        pred = model(batch_x)            # forward pass, shape (batch, 1)

        # Threshold the sigmoid probabilities at 0.5 for accuracy.
        pred_binary = (pred > 0.5).float()
        correct_predictions += (pred_binary.view(-1) == batch_y.view(-1)).sum().item()
        total_predictions += batch_y.size(0)

        # BCELoss needs matching shapes: flatten (batch, 1) -> (batch,).
        loss = loss_fn(pred.view(-1), batch_y)
        epoch_loss += loss.item()
        loss.backward()                  # backpropagate
        optimizer.step()                 # update parameters

    accuracy = correct_predictions / total_predictions * 100
    losses.append(epoch_loss / len(train_loader))
    print(f'Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {accuracy:.2f}%')

在测试集上测试

1
2
3
4
5
6
7
8
9
10
11
# Evaluate on the held-out test split.
# Fix: switch to eval mode so the Dropout layers are disabled during
# inference (the original left the model in training mode, so dropout
# randomly zeroed activations and perturbed the test predictions).
model.eval()
with torch.no_grad():
    pred = model(x_test)                 # probabilities, shape (n, 1)
    pred_binary = (pred > 0.5).float()   # threshold at 0.5 -> {0, 1}

    # Flatten both sides to (n,) so the comparison is element-wise.
    correct = torch.sum(pred_binary.view(-1) == y_test.view(-1))
    total = y_test.size(0)

    print(f'测试集准确度:{100 * correct / total:.2f}%')
测试集准确度:78.50%