背景:Midjourney是面向互联网的图像AIGC产品;而在政企内部,存在大量用内部数据训练自有AIGC工具的需求,其基本出发点是信息安全与合规。
目标:使用自行准备的数据训练与Midjourney同类的模型,形成可私有化部署的AIGC产品。
Diffusion原理:
训练:
扩散过程:从X0到XT,逐步加入高斯噪声进行“扩散”,该过程无需学习,可以直接由公式推导。
反扩散过程:从XT到X0,根据时间步t等相关信息进行无监督学习,得到模型参数。
推理:
通过文本向量转为输入向量,反扩散给定轮次,得到最终图片X0
技术方案:
最新的方式是文本到图像的AIGC生成,但要求的GPU配置笔者手边电脑无法达到。
本例:在蝴蝶数据集上进行训练,从随机初始化的噪声生成新的蝴蝶图片,无需任何输入。
模型:本例采用无条件的Diffusion图片生成(有条件的例如,文本到图像、图像到图像等)
时长:GPU3小时,CPU78小时
用法:直接命令生成蝴蝶图像
一、环境准备
1、安装Anaconda
查看Conda环境
conda env list
创建新环境
conda create -n mydiffusion python=3.8
activate mydiffusion
2、安装依赖包
#如果是CPU
pip install torch
#如果是GPU,首先用 nvcc -V 查看本机的CUDA版本
#去官网查看pytorch与CUDA对应,https://pytorch.org/get-started/previous-versions/
#以下是11.6cuda安装pytorch例子
nvcc -V
conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia
pip install diffusers[training]
#如果有提示其它包安装,看错误提示
pip install xxx
二、训练代码
train.py文件
from dataclasses import dataclass
from datasets import load_dataset
import matplotlib.pyplot as plt
from torchvision import transforms
import torch
from diffusers import UNet2DModel
import torch
from PIL import Image
from diffusers import DDPMScheduler
import torch.nn.functional as F
from diffusers.optimization import get_cosine_schedule_with_warmup
from diffusers import DDPMPipeline
import math
import os
from accelerate import Accelerator
from huggingface_hub import HfFolder, Repository, whoami
from tqdm.auto import tqdm
from pathlib import Path
from accelerate import notebook_launcher
import glob
# Training hyperparameters
@dataclass
class TrainingConfig:
    """Hyperparameters and output settings for the DDPM training run.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: defaults can be overridden per instance through
    keyword arguments (e.g. ``TrainingConfig(num_epochs=5)``) and the values
    appear in ``repr``. Calling ``TrainingConfig()`` with no arguments yields
    the same defaults as before.
    """

    image_size: int = 128  # the generated image resolution
    train_batch_size: int = 16
    eval_batch_size: int = 16  # how many images to sample during evaluation
    num_epochs: int = 50
    gradient_accumulation_steps: int = 1
    learning_rate: float = 1e-4
    lr_warmup_steps: int = 500
    save_image_epochs: int = 10  # sample preview images every N epochs
    save_model_epochs: int = 30  # save the pipeline every N epochs
    mixed_precision: str = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
    output_dir: str = "ddpm-butterflies-128"  # the model name locally and on the HF Hub
    push_to_hub: bool = False  # whether to upload the saved model to the HF Hub
    hub_private_repo: bool = False
    overwrite_output_dir: bool = True  # overwrite the old model when re-running the notebook
    seed: int = 0
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    # Release cached GPU memory that is no longer in use before training starts
    torch.cuda.empty_cache()
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("We will use the GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
# Instantiate the training configuration used by the rest of the script
config = TrainingConfig()
# Load the dataset
# dataset_name is attached to this config instance here; it is not declared on TrainingConfig
config.dataset_name = "huggan/smithsonian_butterflies_subset"
dataset = load_dataset(config.dataset_name, split="train")
# Preview the first four dataset images (disabled; uncomment to inspect the data)
# fig, axs = plt.subplots(1, 4, figsize=(16, 4))
# for i, image in enumerate(dataset[:4]["image"]):
#     axs[i].imshow(image)
#     axs[i].set_axis_off()
# fig.show()
# plt.pause(3000)
# Preprocessing: resize to the configured square resolution, randomly flip, convert to a tensor, normalize
_image_side = config.image_size
_preprocess_steps = [
    transforms.Resize((_image_side, _image_side)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # Mean 0.5 / std 0.5 — presumably maps ToTensor's [0, 1] output to [-1, 1]; confirm against torchvision docs
    transforms.Normalize([0.5], [0.5]),
]
preprocess = transforms.Compose(_preprocess_steps)
# Batch preprocessing hook
def transform(examples):
    """Convert a batch of dataset examples into preprocessed images.

    Args:
        examples: mapping whose "image" entry is an iterable of images that
            expose ``convert`` (presumably PIL images — confirm against the
            dataset).

    Returns:
        A dict with the single key "images" holding one preprocessed item per
        input image, in the original order.
    """
    processed = []
    for picture in examples["image"]:
        processed.append(preprocess(picture.convert("RGB")))
    return {"images": processed}


# Register the hook on the dataset
dataset.set_transform(transform)
# Build the shuffled, batched training loader
train_dataloader = torch.utils.data.DataLoader(
    dataset,
    shuffle=True,
    batch_size=config.train_batch_size,
)
# Build the model: a 2D UNet with six down blocks and six up blocks
_BLOCK_OUT_CHANNELS = (128, 128, 256, 256, 512, 512)  # the number of output channels for each UNet block
_DOWN_BLOCK_TYPES = (
    "DownBlock2D",  # a regular ResNet downsampling block
    "DownBlock2D",
    "DownBlock2D",
    "DownBlock2D",
    "AttnDownBlock2D",  # a ResNet downsampling block with spatial self-attention
    "DownBlock2D",
)
_UP_BLOCK_TYPES = (
    "UpBlock2D",  # a regular ResNet upsampling block
    "AttnUpBlock2D",  # a ResNet upsampling block with spatial self-attention
    "UpBlock2D",
    "UpBlock2D",
    "UpBlock2D",
    "UpBlock2D",
)
model = UNet2DModel(
    sample_size=config.image_size,  # the target image resolution
    in_channels=3,  # the number of input channels, 3 for RGB images
    out_channels=3,  # the number of output channels
    layers_per_block=2,  # how many ResNet layers to use per UNet block
    block_out_channels=_BLOCK_OUT_CHANNELS,
    down_block_types=_DOWN_BLOCK_TYPES,
    up_block_types=_UP_BLOCK_TYPES,
)
# Sanity-check the shapes: push one preprocessed sample (with a leading batch axis added) through the untrained model
sample_image = dataset[0]["images"].unsqueeze(0)
print("Input shape:", sample_image.shape)
print("Output shape:", model(sample_image, timestep=0).sample.shape)
# Create the noise scheduler; this same object is later passed to train_loop
noise_scheduler = DDPMScheduler(num_train_timesteps=1000)
# Demo of the forward (noising) process: add noise to the sample at timestep 50
noise = torch.randn(sample_image.shape)
timesteps = torch.LongTensor([50])
noisy_image = noise_scheduler.add_noise(sample_image, noise, timesteps)
# NOTE(review): the image built here is neither assigned, shown nor saved, so in a plain
# script this line has no visible effect (it looks like a notebook cell output) — confirm intent.
# NOTE(review): values outside [-1, 1] fall outside 0..255 before the uint8 cast — confirm this is acceptable.
Image.fromarray(((noisy_image.permute(0, 2, 3, 1) + 1.0) * 127.5).type(torch.uint8).numpy()[0])
# Demo of the training objective: mean squared error between predicted and actual noise.
# The module-level `loss` computed here is not used afterwards; train_loop computes its own per batch.
noise_pred = model(noisy_image, timesteps).sample
loss = F.mse_loss(noise_pred, noise)
# Optimizer over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
# Learning-rate schedule: cosine with warmup, sized as one step per batch across all epochs
_total_training_steps = len(train_dataloader) * config.num_epochs
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps,
    num_training_steps=_total_training_steps,
)
# Tile images into a single grid image
def make_grid(images, rows, cols):
    """Paste images into one RGB grid, filling it row by row.

    Args:
        images: non-empty sequence of images; the ``size`` of the first image
            is used as the cell width and height for every cell.
        rows: number of grid rows.
        cols: number of grid columns.

    Returns:
        A new RGB image of size ``(cols * width, rows * height)``.
    """
    cell_w, cell_h = images[0].size
    canvas = Image.new("RGB", size=(cols * cell_w, rows * cell_h))
    for index, tile in enumerate(images):
        row, col = divmod(index, cols)
        canvas.paste(tile, box=(col * cell_w, row * cell_h))
    return canvas
# Sample preview images with the DDPMPipeline and save them as one grid
def evaluate(config, epoch, pipeline):
    """Sample images from random noise and save them as a grid PNG.

    Args:
        config: object providing ``eval_batch_size``, ``seed`` and
            ``output_dir``.
        epoch: epoch index, used for the zero-padded output file name.
        pipeline: callable pipeline that accepts ``batch_size`` and
            ``generator`` and returns an object with an ``images`` list.

    Side effects:
        Creates ``<output_dir>/samples`` if needed and writes
        ``<output_dir>/samples/<epoch:04d>.png``.
    """
    # Use a dedicated generator: torch.manual_seed() would reseed the global
    # RNG and rewind the random stream the training loop draws its noise from.
    sampling_rng = torch.Generator(device="cpu").manual_seed(config.seed)
    # Sample some images from random noise (this is the backward diffusion process).
    # The default pipeline output type is `List[PIL.Image]`
    images = pipeline(
        batch_size=config.eval_batch_size,
        generator=sampling_rng,
    ).images
    # Derive a near-square grid from the number of samples instead of assuming
    # 16 images; 16 samples still produce the original 4 x 4 layout.
    cols = max(1, math.ceil(math.sqrt(len(images))))
    rows = math.ceil(len(images) / cols)
    image_grid = make_grid(images, rows=rows, cols=cols)
    # Save the images
    test_dir = os.path.join(config.output_dir, "samples")
    os.makedirs(test_dir, exist_ok=True)
    image_grid.save(f"{test_dir}/{epoch:04d}.png")
def get_full_repo_name(model_id: str, organization: str = None, token: str = None):
    """Return the Hub repository id in the form ``"<namespace>/<model_id>"``.

    Args:
        model_id: repository name without a namespace.
        organization: namespace to use. When ``None``, the name of the account
            that owns ``token`` is looked up and used instead.
        token: access token for the account lookup. Only consulted when
            ``organization`` is ``None``; if it is also ``None``, the token
            returned by ``HfFolder.get_token()`` is used.

    Returns:
        ``f"{organization}/{model_id}"`` or ``f"{username}/{model_id}"``.
    """
    if organization is not None:
        # No account lookup is needed here, so the stored token is not read.
        return f"{organization}/{model_id}"
    if token is None:
        token = HfFolder.get_token()
    username = whoami(token)["name"]
    return f"{username}/{model_id}"
def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler):
    """Train the noise-prediction model and periodically sample and save it.

    For every batch, random noise is added to the clean images at randomly
    drawn timesteps and the model is optimized to predict that noise (MSE
    loss). On the main process, preview images are written every
    ``config.save_image_epochs`` epochs and the pipeline is saved every
    ``config.save_model_epochs`` epochs; both also happen after the last epoch.

    Args:
        config: object providing ``mixed_precision``,
            ``gradient_accumulation_steps``, ``push_to_hub``, ``output_dir``,
            ``num_epochs``, ``save_image_epochs`` and ``save_model_epochs``
            (plus whatever ``evaluate`` reads).
        model: network called as ``model(noisy_images, timesteps, return_dict=False)``.
        noise_scheduler: scheduler providing ``add_noise`` and
            ``config.num_train_timesteps``.
        optimizer: optimizer stepped once per batch.
        train_dataloader: iterable of batches with an "images" entry.
        lr_scheduler: learning-rate scheduler stepped together with the optimizer.
    """
    # Initialize the accelerator (tracker logging is left disabled)
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        # log_with="tensorboard",
        # logging_dir=os.path.join(config.output_dir, "logs"),
    )
    repo = None  # only set on the main process when pushing to the Hub
    if accelerator.is_main_process:
        if config.push_to_hub:
            repo_name = get_full_repo_name(Path(config.output_dir).name)
            repo = Repository(config.output_dir, clone_from=repo_name)
        elif config.output_dir is not None:
            os.makedirs(config.output_dir, exist_ok=True)
        accelerator.init_trackers("train_example")

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    global_step = 0
    last_epoch = config.num_epochs - 1

    # Train the model
    for epoch in range(config.num_epochs):
        progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")

        for batch in train_dataloader:
            clean_images = batch["images"]
            # Sample the noise to add, allocated directly on the images' device
            # (avoids creating it on the CPU and copying it over every step)
            noise = torch.randn(clean_images.shape, device=clean_images.device)
            bs = clean_images.shape[0]

            # Sample a random timestep for each image
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps, (bs,), device=clean_images.device
            ).long()

            # Add noise to the clean images according to the noise magnitude at
            # each timestep (this is the forward diffusion process)
            noisy_images = noise_scheduler.add_noise(clean_images, noise, timesteps)

            with accelerator.accumulate(model):
                # Predict the noise residual
                noise_pred = model(noisy_images, timesteps, return_dict=False)[0]
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                # Clip only on steps where gradients are synchronized, i.e. when
                # an optimizer update is actually applied under accumulation
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
            global_step += 1

        # A new bar is created each epoch, so close the finished one
        progress_bar.close()

        # After each epoch, optionally sample demo images with evaluate() and save the model
        if accelerator.is_main_process:
            pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler)

            if (epoch + 1) % config.save_image_epochs == 0 or epoch == last_epoch:
                evaluate(config, epoch, pipeline)

            if (epoch + 1) % config.save_model_epochs == 0 or epoch == last_epoch:
                # Always write the pipeline to disk first; pushing without saving
                # would upload the repository without the newly trained weights.
                pipeline.save_pretrained(config.output_dir)
                if config.push_to_hub:
                    repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=True)
# Run the training loop directly in this process with the objects built above
# args = (config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler)
train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler)
# Alternative (disabled): launch training through Accelerate's notebook_launcher;
# num_processes can be changed to the number of available GPUs
# notebook_launcher(train_loop, args, num_processes=1)
# Inspect the result: collect the saved sample grids, sorted by file name
sample_images = sorted(glob.glob(f"{config.output_dir}/samples/*.png"))
# NOTE(review): the opened image is neither assigned nor shown, so in a plain script this
# has no visible effect (looks like a notebook cell output); it also raises IndexError
# when no sample PNG exists — confirm intent.
Image.open(sample_images[-1])
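训练代码中的TrainingConfig,比较关键的几个配置:image_size(生成图片的分辨率)、train_batch_size(训练批大小)、num_epochs(训练轮数)、learning_rate(学习率)、mixed_precision(`no`为float32,`fp16`为自动混合精度)、output_dir(模型输出目录)。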
在Terminal中运行:
python train.py
查看gpu,CMD输入以下命令
nvidia-smi
运行时可能会提示请求连接被关闭,多尝试几次即可;笔者尝试了20多次才正常下载完模型(548M)。
三、推理代码
test.py
import torch
from diffusers import DiffusionPipeline

# Load the pipeline that train.py saved into its output directory
generator = DiffusionPipeline.from_pretrained("ddpm-butterflies-128")
# Use the GPU when CUDA is available; otherwise fall back to the CPU so the
# script also runs on CPU-only machines instead of failing on a hard-coded "cuda"
generator.to("cuda" if torch.cuda.is_available() else "cpu")
# Generate one image from random noise and write it to disk
image = generator().images[0]
image.save("generated_image.png")
在命令行中运行:
python test.py
四、输出
每次都是反扩散1000次的结果,预计50秒生成一张图。
以下是多次运行自动生成的蝴蝶:
五、总结
本例中仅使用蝴蝶进行训练,在数据集中生成新的蝴蝶符合预期,但部分图像有杂点。
在现实生产环境中,图像来源多种多样,我们在准备数据集时应尽量均衡不同类型图片的数量与质量,以防止模型产生偏向。
目前的生成速度离实时生成还有一定的距离。
该模型源自Diffusion,是2022年开始推广的较新模型,本文期待抛砖引玉。