Implementing parallel computation with PyTorch on CentOS generally involves two approaches:
Multi-GPU parallelism: use PyTorch's DataParallel or DistributedDataParallel modules to run computation across several GPUs on a single machine.
Multi-node parallelism: use PyTorch's distributed package (torch.distributed) to run computation across multiple nodes.
Below are the basic steps and example code:
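First, it helps to confirm that PyTorch can actually see the GPUs on the CentOS machine. A minimal check, assuming PyTorch was installed with CUDA support:

import torch

# Quick sanity check that PyTorch was built with CUDA and can see the GPUs
print(torch.__version__)
print(torch.cuda.is_available())       # should print True
print(torch.cuda.device_count())       # number of visible GPUs
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # name of the first GPU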
DataParallel

import torch
import torch.nn as nn
from torchvision import models

# Define the model
model = models.resnet50(pretrained=True)

# Check whether more than one GPU is available
if torch.cuda.device_count() > 1:
    print(f"Let's use {torch.cuda.device_count()} GPUs!")
    # Wrap the model
    model = nn.DataParallel(model)

# Move the model to the GPU
model.to('cuda')

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Train the model
# num_epochs and dataloader are assumed to be defined elsewhere (see the sketch that follows)
for epoch in range(num_epochs):
    for inputs, labels in dataloader:
        inputs, labels = inputs.to('cuda'), labels.to('cuda')
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
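The loop above assumes that dataloader and num_epochs already exist. A minimal sketch of how they might be created, using the same CIFAR10 data and normalization as the later examples (the batch size and epoch count here are only illustrative values):

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

num_epochs = 10  # illustrative value
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)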
DistributedDataParallel

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import datasets, transforms, models

# Initialize the distributed environment (the launcher, e.g. torchrun, provides the
# environment variables this reads)
dist.init_process_group(backend='nccl')

# Bind each process to its own GPU
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
device = torch.device(f"cuda:{local_rank}")

# Define the model
model = models.resnet50(pretrained=True).to(device)

# Wrap the model
model = DDP(model, device_ids=[local_rank])

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Data loading (batch_size and num_epochs are assumed to be defined elsewhere)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_sampler = DistributedSampler(train_dataset)
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

# Train the model
for epoch in range(num_epochs):
    train_sampler.set_epoch(epoch)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
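This script is meant to be started with a distributed launcher such as torchrun, which spawns one process per GPU and sets the environment variables the script reads (LOCAL_RANK, RANK, WORLD_SIZE, MASTER_ADDR, MASTER_PORT). On a machine with 4 GPUs the launch would look like: torchrun --nproc_per_node=4 ddp_train.py (ddp_train.py is just a placeholder name for the script above).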
Multi-node parallelism usually involves a more complex setup, including network configuration, IP addresses, and port settings. Here is a simple example:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import datasets, transforms, models

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'master_ip'  # replace with the master node's IP address
    os.environ['MASTER_PORT'] = '12345'      # a free port reachable from all nodes
    dist.init_process_group(backend='nccl', init_method='env://', world_size=world_size, rank=rank)

def cleanup():
    dist.destroy_process_group()

def train(rank, world_size):
    setup(rank, world_size)
    device = torch.device(f"cuda:{rank}")

    # Define the model
    model = models.resnet50(pretrained=True).to(device)
    # Wrap the model
    model = DDP(model, device_ids=[rank])

    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    # Data loading (batch_size and num_epochs are assumed to be defined elsewhere)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    train_sampler = DistributedSampler(train_dataset)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)

    # Train the model
    for epoch in range(num_epochs):
        train_sampler.set_epoch(epoch)
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    cleanup()

if __name__ == "__main__":
    world_size = 4  # total number of processes (one per GPU), not the number of nodes
    mp.spawn(train, args=(world_size,), nprocs=world_size, join=True)
In this example, mp.spawn launches multiple processes, one per GPU, and each process gets its own rank. For a real multi-node run, the same script must be started on every node: MASTER_ADDR has to point to the master node, MASTER_PORT has to be the same everywhere, and each process's global rank has to account for which node it is running on, as sketched below.
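A minimal sketch of that wiring, assuming two nodes each running mp.spawn (worker, nodes, gpus_per_node, and the NODE_RANK environment variable are illustrative names, not part of the example above):

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(local_rank, node_rank, gpus_per_node, world_size):
    # Global rank across all nodes; local_rank selects the GPU on this node
    rank = node_rank * gpus_per_node + local_rank
    os.environ['MASTER_ADDR'] = 'master_ip'  # IP of the node with node_rank == 0
    os.environ['MASTER_PORT'] = '12345'
    dist.init_process_group(backend='nccl', init_method='env://',
                            world_size=world_size, rank=rank)
    torch.cuda.set_device(local_rank)
    # ... build the model with DDP(model, device_ids=[local_rank]) and train as above ...
    dist.destroy_process_group()

if __name__ == "__main__":
    nodes = 2                                  # assumed number of nodes
    gpus_per_node = torch.cuda.device_count()  # processes launched on this node
    world_size = nodes * gpus_per_node         # total number of processes
    node_rank = int(os.environ.get('NODE_RANK', 0))  # set differently on each node (0, 1, ...)
    mp.spawn(worker, args=(node_rank, gpus_per_node, world_size),
             nprocs=gpus_per_node, join=True)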
In summary: for single-machine multi-GPU training, use DataParallel or DistributedDataParallel; for multi-node training, use the torch.distributed package and set MASTER_ADDR, MASTER_PORT, and the other related environment variables. With these steps, you can run PyTorch parallel computation on CentOS.