环境配置

在最开始简单的提一下环境配置

CUDA的版本

打开cmd,输入以下代码,来查看CUDA的版本

1
nvidia-smi


根据右上角的CUDA Version,到pytorch官网查找对应的版本

配环境&下载pytorch库

打开anaconda自带的cmd

1
2
3
conda create -n "环境名"
conda activate "你的环境名"
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

这里用CUDA12.6演示
如果没有英伟达的显卡或者电脑没有显卡选CPU版本

测试你的pytorch是否配置成功

在终端打以下代码

1
2
3
4
python

import torch
torch.cuda.is_available()

返回True则配置成功

大致的训练流程

  1. 搭建自己的dataset
  2. 用dataloader高效的加载数据并提供迭代器接口供训练循环使用
  3. 搭建自己的训练网络(model)
  4. 设置损失函数 (loss_fn),优化器(optimizer),学习率 (lr),训练轮数 (epoch)
  5. 开始训练(可以通过.cuda传到GPU训练)
  6. 测试模型
  7. 保存模型以.pth的格式

Dataset

定义数据集结构和单个样本的获取方式(读取和预处理)
代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from torch.utils.data import Dataset
from PIL import Image
import os



class MyDataset(Dataset):
    """Image dataset where a sub-folder name serves as the label for every image inside it."""

    def __init__(self, root_dir, label_dir):
        # Remember both path components so each sample path can be rebuilt later.
        self.root_dir = root_dir
        self.label_dir = label_dir
        self.path = os.path.join(self.root_dir, self.label_dir)
        # One file name per sample, listed from root_dir/label_dir.
        self.img_path = os.listdir(self.path)

    def __getitem__(self, idx):
        # Resolve the idx-th file name to a full path and open it on demand.
        img_item_path = os.path.join(self.root_dir, self.label_dir, self.img_path[idx])
        image = Image.open(img_item_path)
        # The folder name doubles as the (string) label for every sample.
        return image, self.label_dir

    def __len__(self):
        # Dataset size == number of files found in the labelled folder.
        return len(self.img_path)



# Each class lives in its own sub-folder under the train directory.
root_dir = 'hymenoptera_data/train'
ants_label_dir = 'ants'
bees_label_dir = 'bees'
bees_dataset = MyDataset(root_dir, bees_label_dir)
ants_dataset = MyDataset(root_dir, ants_label_dir)
# Create one sub-dataset per class.


train_dataset = ants_dataset + bees_dataset
# Merge the two datasets (torch Dataset `+` concatenates them).
# Note: merging per-class datasets like this is a handy way to build a custom training set.

这里是子类去继承父类的特征,但需要自己改写 __getitem__ 和 __len__ 方法

数据集获取:https://download.pytorch.org/tutorial/hymenoptera_data.zip

Dataloader

负责高效地(并行、预取)生成批次数据、打乱顺序,并提供简洁的迭代接口
torchvision&tensorboard的下载

1
pip install torchvision tensorboard

代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import torchvision
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter


# CIFAR-10 test split as tensors. download=True fetches the archive on first
# use (the original omitted it, so the script failed on a fresh machine and
# was inconsistent with the training scripts below).
test_data = torchvision.datasets.CIFAR10("./dataset", train=False, download=True,
                                         transform=torchvision.transforms.ToTensor())
# drop_last=True discards the final incomplete batch, so every batch has exactly 64 images.
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=True, num_workers=0, drop_last=True)


# Inspect the first sample: image tensor shape and its class index.
img, target = test_data[0]
print(img.shape)
print(target)


writer = SummaryWriter('dataloader')
for epoch in range(2):
    step = 0
    for data in test_loader:
        imgs, targets = data
        # imgs.shape == torch.Size([64, 3, 32, 32]); targets holds 64 class indices.
        writer.add_images('Epoch: {}'.format(epoch), imgs, step)
        step = step + 1

writer.close()

我们可以在我们自己创建的环境终端下打开
输入以下代码

1
tensorboard --logdir=logs

注意:如果报错将logs换成绝对路径

ctrl+左键打开终端给我们的地址就可看到

Model

从数据中学习规律来完成预测或决策任务
代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import torch
from torch import nn

class mrk_dataset(torch.nn.Module):
    """Small CNN for CIFAR-10: three conv+pool stages followed by two linear layers."""

    def __init__(self):
        super(mrk_dataset, self).__init__()
        # Input: (N, 3, 32, 32). Each MaxPool2d(2) halves H and W:
        # 32 -> 16 -> 8 -> 4, hence the flattened size 64 * 4 * 4.
        self.model = nn.Sequential(
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 32, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(64 * 4 * 4, 64),
            nn.Linear(64, 10),
        )

    def forward(self, x):
        # BUG FIX: the original `return x` was indented 7 spaces instead of 8,
        # which raised an IndentationError. Returns raw logits for 10 classes.
        x = self.model(x)
        return x


if __name__ == '__main__':
    # Smoke test: a fake batch of 64 CIFAR-sized images must yield a (64, 10) output.
    net = mrk_dataset()
    dummy = torch.ones((64, 3, 32, 32))
    result = net(dummy)
    print(result.shape)

这是一个简单的卷积神经网络,利用torch中的nn工具,进行搭建
本质逻辑就是一个数学函数

$$
f(h(g(k(l(m(n()))))))
$$

通过正向传播和反向传播调整权重和偏置

Train

训练的整个过程
代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import torchvision
import torch
from model import *
from torch import nn
from torch.utils.tensorboard import SummaryWriter


# Prepare the datasets (CIFAR-10, downloaded on first use, converted to tensors).
train_data = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())

test_data = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())


# Dataset sizes, used later to compute accuracy.
train_data_size = len(train_data)
test_data_size = len(test_data)

print("训练数据集的长度为:{}".format(train_data_size))
print("测试数据集的长度为:{}".format(test_data_size))


# Wrap the datasets in DataLoaders for shuffled, batched iteration.
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=64, shuffle=True, num_workers=0, drop_last=False)

test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=64, shuffle=True, num_workers=0, drop_last=False)


# Build the network (class imported from model.py).
m = mrk_dataset()


# Cross-entropy loss for 10-class classification.
loss_fn = nn.CrossEntropyLoss()


# Plain SGD optimizer.
learning_rate = 1e-2
optimizer = torch.optim.SGD(m.parameters(), lr=learning_rate)



# Bookkeeping counters.
total_train_step = 0   # optimizer steps taken so far
total_test_step = 0    # completed evaluation rounds (TensorBoard x-axis)
epoch = 10             # number of training epochs


# TensorBoard writer.
writer = SummaryWriter('./logs_train')


for i in range(epoch):
    print("-------第 {} 轮训练开始-------".format(i+1))

    # ---- training phase ----
    m.train()
    for data in train_loader:
        imgs, targets = data
        outputs = m(imgs)
        loss = loss_fn(outputs, targets)

        # Standard step: clear old gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_step = total_train_step + 1
        if total_train_step % 100 == 0:
            print("训练次数:{},Loss:{}".format(total_train_step, loss.item()))
            writer.add_scalar('train_loss', loss.item(), total_train_step)

    # ---- evaluation phase ----
    # BUG FIX: the original reset total_test_step to 0 each epoch and reused
    # it as the loss accumulator, so add_scalar received the loss value as
    # its global_step. Accumulate loss in a dedicated variable instead.
    m.eval()  # consistent with the GPU version; switches eval-mode layers
    total_test_loss = 0.0
    total_accuracy = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for data in test_loader:
            imgs, targets = data
            outputs = m(imgs)
            loss = loss_fn(outputs, targets)
            total_test_loss = total_test_loss + loss.item()
            # argmax(1) -> predicted class per sample; count correct predictions.
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy = total_accuracy + accuracy

    print("整体测试集上的Loss:{}".format(total_test_loss))
    print("整体测试集上的正确率:{}".format(total_accuracy/test_data_size))
    writer.add_scalar('test_loss', total_test_loss, total_test_step)
    writer.add_scalar('test_accuracy', total_accuracy/test_data_size, total_test_step)
    total_test_step = total_test_step + 1

    # Saves the whole model object; torch.save(m.state_dict(), ...) is the
    # recommended, more portable alternative.
    torch.save(m, "m_{}.pth".format(i))
    print("模型已保存")


writer.close()


以上为整个训练过程

GPU训练

代码实现:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import torchvision
import torch
from torch import nn
import time


# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# CIFAR-10 train/test splits, downloaded on first use and converted to tensors.
train_data = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=torchvision.transforms.ToTensor())
test_data = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=torchvision.transforms.ToTensor())


# Dataset sizes, used later for the accuracy denominator.
train_data_size = len(train_data)
test_data_size = len(test_data)
print("训练数据集的长度为:{}".format(train_data_size))
print("测试数据集的长度为:{}".format(test_data_size))


# Batched, shuffled loaders; drop_last=False keeps the final partial batch.
train_loader = torch.utils.data.DataLoader(dataset=train_data, batch_size=64, shuffle=True, num_workers=0, drop_last=False)
test_loader = torch.utils.data.DataLoader(dataset=test_data, batch_size=64, shuffle=True, num_workers=0, drop_last=False)


class mrk_dataset(torch.nn.Module):
    """Small CNN for CIFAR-10: three conv/pool stages, then two linear layers."""

    def __init__(self):
        super(mrk_dataset, self).__init__()
        # Assemble the layer list first, then wrap it in a Sequential container.
        layers = [
            nn.Conv2d(3, 32, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 32, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 5, 1, 2),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(64 * 4 * 4, 64),
            nn.Linear(64, 10),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        # Run the batch through the sequential stack and return the class scores.
        return self.model(x)

# Instantiate the network and move it to the selected device.
m = mrk_dataset()
m.to(device)


# Cross-entropy loss, also moved to the device.
loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)


# Plain SGD optimizer.
learning_rate = 1e-2
optimizer = torch.optim.SGD(m.parameters(), lr=learning_rate)


# Bookkeeping counters.
total_train_step = 0   # optimizer steps taken so far
total_test_step = 0    # completed evaluation rounds
epoch = 10

start_time = time.time()

for i in range(epoch):
    print("-------第 {} 轮训练开始-------".format(i+1))

    # ---- training phase ----
    m.train()
    for data in train_loader:
        imgs, targets = data
        # Move the batch to the same device as the model.
        imgs = imgs.to(device)
        targets = targets.to(device)
        outputs = m(imgs)
        loss = loss_fn(outputs, targets)

        # BUG FIX: optimizer.zero_grad() was dedented to column 0 in the
        # original (a syntax error); it must run inside this batch loop.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_step = total_train_step + 1
        if total_train_step % 100 == 0:
            end_time = time.time()
            print("训练时间:{}".format(end_time-start_time))
            print("训练次数:{},Loss:{}".format(total_train_step, loss.item()))

    # ---- evaluation phase ----
    # BUG FIX: the original reused total_test_step as the loss accumulator;
    # keep the counter and accumulate loss in a dedicated variable.
    m.eval()
    total_test_loss = 0.0
    total_accuracy = 0
    with torch.no_grad():  # no gradients needed during evaluation
        for data in test_loader:
            imgs, targets = data
            imgs = imgs.to(device)
            targets = targets.to(device)
            outputs = m(imgs)
            loss = loss_fn(outputs, targets)
            total_test_loss = total_test_loss + loss.item()
            # argmax(1) -> predicted class per sample; count correct predictions.
            accuracy = (outputs.argmax(1) == targets).sum()
            total_accuracy = total_accuracy + accuracy

    print("整体测试集上的Loss:{}".format(total_test_loss))
    print("整体测试集上的正确率:{}".format(total_accuracy/test_data_size))
    total_test_step = total_test_step + 1

    # Optional checkpointing (state_dict saving is the recommended form):
    # torch.save(m, "m_{}.pth".format(i))
    # # torch.save(m.state_dict(), "m_{}.pth".format(i))
    # print("模型已保存")

课程推荐:https://www.bilibili.com/video/BV1hE411t7RN/?spm_id_from=333.1387.0.0&vd_source=4ae638a32d27a4ed5423a629ef623fb1