python期末课程设计(机器学习的初步尝试)

主要内容

  • 熟悉numpy和pandas扩展库(可见往期博客)
  • 理解逻辑回归模型实现鸢尾花分类
  • mnist手写体识别

线性回归模型

线性回归使用直线 y=ax+b 去拟合数据:通过调整a和b的值,使这条直线尽量贴近所有的数据点,即让预测值与真实值之间的均方误差(残差平方和)最小。均方误差关于a和b是一个凸函数,所以可以通过求导得到它的变化趋势(梯度),再沿着使误差下降的方向调整参数,最终得到a和b的最优解。

到逻辑回归

sigmoid函数 $\sigma(x)=\frac{1}{1+e^{-x}}$ 将所有的x值都映射到0和1之间,因此可以把它的输出看作概率:每输入一个x值,总能得到一个唯一的、介于0和1之间的概率值。
2

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# sigmod绘制代码
import matplotlib.pyplot as plt
import numpy as np

def sigmod(x):
    """Return the logistic sigmoid 1 / (1 + e^(-x)) of *x*.

    Works element-wise on NumPy arrays as well as on plain scalars; every
    output lies in the interval [0, 1].
    """
    return 1 / (1 + np.exp(-x))
# Sample the curve on [-10, 10) in steps of 0.1.  The samples are deliberately
# not stored in a variable called `list`, which would shadow the builtin and
# break any later `list(...)` call in the same session.
xs = np.arange(-10, 10, 0.1)
ys = sigmod(xs)

# Hide the top/right borders and move the remaining two spines so that they
# cross at the data origin, giving a conventional x/y axis cross.
ax = plt.gca()
ax.spines['right'].set_color('none')
ax.spines['top'].set_color('none')
ax.spines['left'].set_position(('data', 0))
ax.spines['bottom'].set_position(('data', 0))
plt.xlabel('x')
plt.ylabel('y')
plt.plot(xs, ys)
# Dashed reference line at y = 0.5, the value of the sigmoid at x = 0.
plt.plot([-10, 10], [0.5, 0.5], linestyle='--')
plt.show()

逻辑回归就是在线性回归的基础上加了一个sigmoid函数:将sigmoid函数 $\sigma(z)=\frac{1}{1+e^{-z}}$ 中的 z 替换为线性组合 $z = w_0+w_1x_1+w_2x_2+\dots+w_nx_n$,再用一个代价函数表示预测值与真实值之间的差距。这个代价函数可以是均方误差(最小二乘),但逻辑回归更常用的是交叉熵损失函数。
1

有了损失函数就可以利用梯度下降法,通过多次迭代训练,求出使损失函数取得最小值的所有w的值。x的个数(n)代表了特征的维度。

这里的表述可能不是很标准,其中的推导过程要复杂的多

鸢尾花识别

该数据集测量了所有150个样本的4个特征,分别是:

sepal length(花萼长度)
sepal width(花萼宽度)
petal length(花瓣长度)
petal width(花瓣宽度)

以上四个特征的单位都是厘米(cm)。

通常使用m表示样本量的大小,n表示每个样本所具有的特征数。因此在该数据集中,m=150,n=4

二维特征鸢尾花识别及可视化(以花瓣宽和长进行逻辑回归为例)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

# Column indices of the two iris features used for the 2-D model
# (2 = petal length, 3 = petal width in the `dataname` list below).
type1, type2 = 2, 3
# 可视化部分
def plot_decision_regions(X, y, classifier, key, resolution=0.02):
    """Draw a classifier's decision regions and overlay the labelled samples.

    Args:
        X: 2-D array of samples; only columns 0 and 1 are read.
        y: Array of class labels, one per row of ``X``.
        classifier: Fitted object whose ``predict`` accepts an ``(n, 2)`` array.
        key: Sequence indexed by class label that supplies the legend text.
        resolution: Step of the mesh grid on which the classifier is evaluated.

    The function draws onto the current matplotlib figure and returns nothing.
    Only three marker/colour pairs are defined, so at most three distinct
    labels in ``y`` are supported.
    """
    markers = ('s', 'x', 'o')
    colors = ('red', 'Blue', 'lightgreen')
    classes = np.unique(y)
    cmap = ListedColormap(colors[:len(classes)])

    # Pad the data range by one unit on each side so no sample sits on the
    # border of the plot.
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))

    # Classify every grid point, then fold the flat predictions back into the
    # grid shape so they can be drawn as filled contours.
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(classes):
        # `color=` (not `c=`) because a single RGBA tuple is passed for the
        # whole group of points.
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1], alpha=0.8,
                    color=cmap(idx), marker=markers[idx], label=key[cl])

iris = datasets.load_iris()
dataname = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Keep only the two feature columns selected by type1 / type2.
X = iris.data[:, [type1, type2]]
y = iris.target
name = iris.target_names

# Hold out 10% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
# Standardize the features; the scaler is fitted on the training split only
# and then applied to both splits.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# Recombine both splits so the final plot shows every sample.
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
res = lr.predict_proba(X_test_std)

# Dummy point drawn only to create the "error" legend entry; it falls outside
# the axis limits that plot_decision_regions sets afterwards.
plt.scatter([-100], [-100], label="error", c='cyan', marker='^')
for i, probs in enumerate(res):
    # Index of the most probable class (first one on ties).
    k = int(np.argmax(probs))
    print("测试%d:是%s类的概率为:%.4f" % (i, name[k], probs[k]), end=':')
    if k == y_test[i]:
        print('正确')
    else:
        print('错误', end=' ')
        print(X_test_std[i], "应该是", name[y_test[i]])
        # Highlight the misclassified test sample on the plot.
        plt.scatter(X_test_std[i][0], X_test_std[i][1], c='cyan', alpha=1,
                    linewidth=10, marker='^')
plot_decision_regions(X_combined_std, y_combined, classifier=lr, key=name)
plt.xlabel(dataname[type1])
plt.ylabel(dataname[type2])
plt.legend(loc='upper left')
plt.show()

3

四维特征鸢尾花识别

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from sklearn.model_selection import  train_test_split
import pandas as pd
import numpy as np

#导入数据
# Feature columns and class names as they appear in Iris.csv.
name = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
Species = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
iris = pd.read_csv('Iris.csv', index_col=0)
X = np.array(iris.loc[:, name])
y = np.array((iris.loc[:, 'Species']))
# Encode each species string as its position in the Species list.
y = [Species.index(i) for i in y]

# Use half of the samples for training and half for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
# Standardize the features; the scaler is fitted on the training split only.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

from sklearn.linear_model import LogisticRegression
# Configure and fit the logistic-regression classifier.
lr = LogisticRegression(C=1000.0, random_state=0)
lr.fit(X_train_std, y_train)
res = lr.predict_proba(X_test_std)
for i, probs in enumerate(res):
    # Index of the most probable class (first one on ties).
    k = int(np.argmax(probs))
    # Class indices must be looked up in Species; `name` holds the feature
    # column names and would print the wrong label.
    print("测试%d:是%s类的概率为:%.4f" % (i, Species[k], probs[k]), end=':')
    if k == y_test[i]:
        print('正确')
    else:
        print('错误', end=' ')
        print(X_test[i], "应该是", Species[y_test[i]])

4

使用pytorch,卷积神经网络实现mnist手写体识别

在本次识别任务中,我选择了卷积神经网络(CNN)的做法来实现,mnist数据是28*28的图片及其表示的数字标签,数据导入可以使用pytorch自带的mnist数据进行测试训练,分析原数据得知,训练数据60000,测试数据10000。

神经网络结构

由于数据非常单一,且容易识别,所以选择了较为简单的神经网络结构:两个卷积层加两个全连接层。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
class ConvNet(nn.Module):
    """Small CNN: two convolution layers followed by two linear layers.

    ``forward`` expects a batch of single-channel 28x28 images, i.e. a tensor
    of shape ``(batch, 1, 28, 28)``, and returns per-class log-probabilities
    of shape ``(batch, 10)``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, 5)    # input:(1,28,28) output:(10,24,24)
        self.conv2 = nn.Conv2d(10, 20, 3)   # input:(10,12,12) output:(20,10,10)
        self.fc1 = nn.Linear(20 * 10 * 10, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        in_size = x.size(0)             # batch size, needed for flattening
        out = self.conv1(x)
        out = F.relu(out)
        out = F.max_pool2d(out, 2, 2)   # (10,24,24) -> (10,12,12)
        out = self.conv2(out)
        out = F.relu(out)
        out = out.view(in_size, -1)     # flatten to (batch, 20*10*10)
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        # Log-probabilities, to be paired with F.nll_loss.
        out = F.log_softmax(out, dim=1)
        return out

训练部分代码

1
2
3
4
5
6
7
8
9
10
11
12
13
def train(model, device, train_loader, optimizer, epoch):
    """Run one training pass of *model* over *train_loader*.

    Args:
        model: Network whose output is fed to ``F.nll_loss``.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches; ``len()`` and
            ``.dataset`` are read only when a progress line is printed.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch number, used only in the progress output.

    A progress line is printed after every 30th batch.
    """
    model.train()
    seen = 0  # samples processed so far in this epoch
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        seen += len(data)
        if (batch_idx + 1) % 30 == 0:
            # Report what has been processed *including* this batch, so the
            # counter and the percentage agree with the logging condition.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, seen, len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), loss.item()))

测试部分代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
def test(model, device, test_loader):
    """Evaluate *model* on *test_loader* and print average loss and accuracy.

    Args:
        model: Network whose output is fed to ``F.nll_loss``.
        device: Device each batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches exposing
            ``.dataset`` (its length is the number of evaluated samples).
    """
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not average) the per-sample losses of the batch so the
            # total can be divided by the dataset size afterwards.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the highest score per sample is the predicted class.
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

完整代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
torch.__version__

# Mini-batch size used by the data loaders.
BATCH_SIZE = 512
# Number of passes over the training set.
EPOCHS = 1
# Prefer CUDA when torch reports it as available, otherwise use the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Preprocessing shared by both splits: convert the image to a tensor, then
# normalize it with mean 0.1307 and standard deviation 0.3081.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training split: reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=True, download=True,
                   transform=mnist_transform),
    batch_size=BATCH_SIZE, shuffle=True)

# Test split: evaluation sums over all samples, so shuffling is unnecessary.
# download=True lets this loader be built even if the data is not on disk yet.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, download=True,
                   transform=mnist_transform),
    batch_size=BATCH_SIZE, shuffle=False)

class ConvNet(nn.Module):
    """Small CNN: two convolution layers followed by two linear layers.

    ``forward`` expects a batch of single-channel 28x28 images, i.e. a tensor
    of shape ``(batch, 1, 28, 28)``, and returns per-class log-probabilities
    of shape ``(batch, 10)``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, 5)    # input:(1,28,28) output:(10,24,24)
        self.conv2 = nn.Conv2d(10, 20, 3)   # input:(10,12,12) output:(20,10,10)
        self.fc1 = nn.Linear(20 * 10 * 10, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        in_size = x.size(0)             # batch size, needed for flattening
        out = self.conv1(x)
        out = F.relu(out)
        out = F.max_pool2d(out, 2, 2)   # (10,24,24) -> (10,12,12)
        out = self.conv2(out)
        out = F.relu(out)
        out = out.view(in_size, -1)     # flatten to (batch, 20*10*10)
        out = self.fc1(out)
        out = F.relu(out)
        out = self.fc2(out)
        # Log-probabilities, to be paired with F.nll_loss.
        out = F.log_softmax(out, dim=1)
        return out

# Build the network, then move its parameters to the selected device
# (GPU or CPU, depending on DEVICE).
model = ConvNet()
model = model.to(DEVICE)
# Adam optimizer over all model parameters, with default settings.
optimizer = optim.Adam(model.parameters())

def train(model, device, train_loader, optimizer, epoch):
    """Run one training pass of *model* over *train_loader*.

    Args:
        model: Network whose output is fed to ``F.nll_loss``.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of ``(data, target)`` batches; ``len()`` and
            ``.dataset`` are read only when a progress line is printed.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch number, used only in the progress output.

    A progress line is printed after every 30th batch.
    """
    model.train()
    seen = 0  # samples processed so far in this epoch
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        seen += len(data)
        if (batch_idx + 1) % 30 == 0:
            # Report what has been processed *including* this batch, so the
            # counter and the percentage agree with the logging condition.
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, seen, len(train_loader.dataset),
                100. * (batch_idx + 1) / len(train_loader), loss.item()))

def test(model, device, test_loader):
    """Evaluate *model* on *test_loader* and print average loss and accuracy.

    Args:
        model: Network whose output is fed to ``F.nll_loss``.
        device: Device each batch is moved to before the forward pass.
        test_loader: Iterable of ``(data, target)`` batches exposing
            ``.dataset`` (its length is the number of evaluated samples).
    """
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # Sum (not average) the per-sample losses of the batch so the
            # total can be divided by the dataset size afterwards.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Index of the highest score per sample is the predicted class.
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()

    test_loss /= len(test_loader.dataset)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

# Resume from an earlier checkpoint if one exists.  This is done once, before
# training: the file written at the end of each epoch is identical to the
# in-memory model, so reloading it every epoch would be redundant.
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
try:
    model.load_state_dict(torch.load('model.pt', map_location=DEVICE))
    print('导入神经网络成功')
except (OSError, RuntimeError):
    # Missing/unreadable file, or a checkpoint that does not fit this model:
    # start from the freshly initialised weights instead.
    print('没有可导入的神经网络')

for epoch in range(1, EPOCHS + 1):
    train(model, DEVICE, train_loader, optimizer, epoch)
    # Checkpoint after every epoch so an interrupted run can be resumed.
    torch.save(model.state_dict(), 'model.pt')
    print('保存成功')
    test(model, DEVICE, test_loader)
坚持原创技术分享,您的支持也将成为我的动力!
-------------本文结束感谢您的阅读-------------
undefined