[Generative Models] Flux-Fill and Quantization
Speed and VRAM usage test
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

import torch
from diffusers.utils import load_image

def infer_test(pipe):
    image = load_image("../dataset/tmp/cup.png")
    mask = load_image("../dataset/tmp/cup_mask.png")
    size = mask.size  # (1232, 1632)
    image = image.resize((size[0] // 2, size[1] // 2))  # (616, 816)
    mask = mask.resize((size[0] // 2, size[1] // 2))
    image_out = pipe(
        prompt="a white paper cup",
        image=image,
        mask_image=mask,
        height=mask.size[1],
        width=mask.size[0],
        guidance_scale=30,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]
    # image_out.save("flux-fill-dev.png")
    return image_out
| model | load GPU Mem | run GPU Mem (616, 816) | run GPU Mem (1232, 1632) |
|---|---|---|---|
| Flux-Fill (FP16, torch.bfloat16) | 33.7 GB | 36.2 GB | 39.7 GB (49 s) |
| Flux-Fill (FP16, torch.bfloat16), FluxTransformer2DModel only | 24.2 GB | | |
| Flux-Fill (torch.float8_e4m3fn), FluxTransformer2DModel only | 13.2 GB | | |
| Flux-Fill (Q8_0.gguf, torch.bfloat16), FluxTransformer2DModel only | 14.0 GB | | |
| Flux-Fill (Q8_0.gguf, torch.bfloat16) | 23.5 GB | 27.8 GB (55 s) | |
| Flux-Fill (BitsAndBytes load_in_8bit, torch.bfloat16), FluxTransformer2DModel only | 13.3 GB | | |
| Flux-Fill (BitsAndBytes load_in_8bit, torch.bfloat16) | 22.7 GB | 27.1 GB (76 s) | |
| Flux-Fill (BitsAndBytes float8_e4m3fn, torch.bfloat16) | 22.7 GB | 27.1 GB (71 s) | |
Notes:
- The timings above were measured on a single A100 40G (a sketch of how the memory columns can be read off the CUDA allocator follows these notes).
- In the GGUF runs, only the FluxTransformer2DModel was GGUF-quantized.
- GGUF and BitsAndBytes only store the model weights in 8 bits; the computation itself still runs in the configured compute dtype (bfloat16 in every case here), so inference is actually slower than plain bfloat16, and BitsAndBytes is slower still.
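The memory columns can be read off the CUDA caching allocator; a minimal sketch of such a measurement (the original post does not show its measurement code, and nvidia-smi readings would differ slightly because of allocator overhead):

import torch

def report_mem(tag):
    # "load GPU Mem" ~ allocated right after building the pipeline,
    # "run GPU Mem" ~ peak allocation during denoising.
    torch.cuda.synchronize()
    alloc = torch.cuda.memory_allocated() / 2**30
    peak = torch.cuda.max_memory_allocated() / 2**30
    print(f"{tag}: allocated {alloc:.1f} GiB, peak {peak:.1f} GiB")

# report_mem("after load")
# image_out = infer_test(pipe)
# report_mem("after run")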
1. Load Flux-Fill
1.1 Load (FP16)
import torch
from diffusers import FluxFillPipeline, FluxTransformer2DModel
pipe = FluxFillPipeline.from_pretrained("../model_hub/black-forest-labs/FLUX.1-Fill-dev", torch_dtype=torch.bfloat16).to("cuda")
# pipe.enable_model_cpu_offload()
infer_test(pipe)
1.2 Load Flux-Fill (FP16) step by step
import torch
from diffusers import FluxFillPipeline, FluxTransformer2DModel
model_name = "../model_hub/black-forest-labs/FLUX.1-Fill-dev"
transformer = FluxTransformer2DModel.from_pretrained(
model_name, subfolder="transformer",torch_dtype=torch.bfloat16).to('cuda')
pipe = FluxFillPipeline.from_pretrained(
"../model_hub/black-forest-labs/FLUX.1-Fill-dev",
transformer=transformer,
torch_dtype=torch.bfloat16).to("cuda")
1.3 Load Flux-Fill (FP16) to FP8
import torch
from diffusers import FluxFillPipeline, FluxTransformer2DModel
model_name = "../model_hub/black-forest-labs/FLUX.1-Fill-dev"
transformer = FluxTransformer2DModel.from_pretrained(
model_name, subfolder="transformer",torch_dtype=torch.float8_e4m3fn) # .to('cuda', torch.bfloat16) # torch 2.2.1+
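To actually run this FP8-stored checkpoint, the weights still have to be cast back to a compute dtype before assembling the pipeline; a minimal sketch continuing the code above (the float8-to-bfloat16 cast needs a recent torch, as noted in the comment):

# Cast the float8-stored weights back to bfloat16 for compute and plug the
# transformer into the pipeline exactly as in section 1.2.
transformer = transformer.to("cuda", torch.bfloat16)
pipe = FluxFillPipeline.from_pretrained(
    model_name, transformer=transformer, torch_dtype=torch.bfloat16).to("cuda")
infer_test(pipe)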
1.4 Kijai/flux-fp8 is just the FP16 weights cast to torch.float8_e4m3fn and saved directly
Kijai/flux-fp8: https://huggingface.co/Kijai/flux-fp8 (widely used in ComfyUI)
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel
# bfloat16
transformer1 = FluxTransformer2DModel.from_single_file(
"../projects/ComfyUI/models/unet/Kijai/flux-fp8/flux1-dev-fp8.safetensors",
config="../model_hub/black-forest-labs/FLUX.1-dev/", subfolder="transformer").to(torch.bfloat16)
transformer2 = FluxTransformer2DModel.from_pretrained(
"../model_hub/black-forest-labs/FLUX.1-dev/", subfolder="transformer").to(torch.float8_e4m3fn).to(torch.bfloat16)
transformer1_dict = {name: param for name, param in transformer1.named_parameters()}
transformer2_dict = {name: param for name, param in transformer2.named_parameters()}
s = 0
for name, d1 in transformer1_dict.items():
d2 = transformer2_dict[name]
d = (d1 - d2).abs().sum().item()
s += d
print(name, d)
print("diff is:", s)
time_text_embed.timestep_embedder.linear_1.weight 0.0
time_text_embed.timestep_embedder.linear_1.bias 0.0
......
transformer_blocks.15.norm1_context.linear.bias 0.0
......
diff is: 0.0
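The zero total difference is expected: both tensors are produced by the same bfloat16-to-float8_e4m3fn rounding, so they land on identical values, even though the fp8 round trip itself is lossy. A standalone sketch of that effect:

import torch

x = torch.randn(4, 4, dtype=torch.bfloat16)       # stand-in for a bf16 weight
a = x.to(torch.float8_e4m3fn).to(torch.bfloat16)  # "cast to fp8, save, reload as bf16"
b = x.to(torch.float8_e4m3fn).to(torch.bfloat16)  # fresh cast of the same bf16 weight
print((a - b).abs().sum().item())  # 0.0: both paths apply identical rounding
print((x - a).abs().max().item())  # > 0: the fp8 round trip itself loses precision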
- If the FP8 checkpoint is loaded as torch.bfloat16, the code below saves no VRAM and gives no speedup; it is equivalent to ordinary mixed-precision compute.
- If it is loaded as torch.float8_e4m3fn, computation requires CUDA 12.x (otherwise: RuntimeError: "index_select_cuda" not implemented for 'Float8_e4m3fn'); not tested here.
- Genuine FP8 mixed-precision compute needs dedicated code support (ComfyUI implements its own; a minimal sketch of the idea follows this list):
  - https://github.com/comfyanonymous/ComfyUI/blob/master/nodes.py#L2013 ("UNETLoader": "Load Diffusion Model")
  - https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/ops.py
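A minimal, illustrative sketch of what such support boils down to (this mirrors the idea, not ComfyUI's actual ops.py): keep the weight stored as float8 and upcast it to the activation dtype inside each forward pass.

import torch
import torch.nn as nn

class Float8Linear(nn.Linear):
    # Weight is stored as float8_e4m3fn and upcast to the activation dtype only
    # for the matmul, so VRAM is saved but every call pays an extra cast.
    def forward(self, x):
        w = self.weight.to(x.dtype)
        b = self.bias.to(x.dtype) if self.bias is not None else None
        return nn.functional.linear(x, w, b)

lin = Float8Linear(16, 16)
lin.weight.data = lin.weight.data.to(torch.float8_e4m3fn)  # simulate fp8 storage
y = lin(torch.randn(2, 16, dtype=torch.bfloat16))          # compute runs in bfloat16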
def infer_test_flux(pipe):
    prompt = "A cat holding a sign that says hello world"
    image = pipe(
        prompt,
        height=1024,
        width=1024,
        guidance_scale=3.5,
        num_inference_steps=50,
        max_sequence_length=512,
        generator=torch.Generator("cpu").manual_seed(0),
    ).images[0]
    # image.save("flux-dev.png")
    return image
import torch
from diffusers import FluxPipeline, FluxTransformer2DModel
transformer = FluxTransformer2DModel.from_single_file(
"../projects/ComfyUI/models/unet/Kijai/flux-fp8/flux1-dev-fp8.safetensors",
config="../model_hub/black-forest-labs/FLUX.1-dev/", subfolder="transformer")
pipe = FluxPipeline.from_pretrained("../model_hub/black-forest-labs/FLUX.1-dev", transformer=transformer).to(torch.bfloat16).to("cuda")
infer_test_flux(pipe)
2. Mixed-Precision Inference
GGUF and BitsAndBytes only store the model weights in 8 bits; the computation is still carried out in the configured compute dtype (bfloat16 in all cases here), so inference is actually slower than plain bfloat16. BitsAndBytes is slower still (possibly related to it being designed primarily for training).
2.1 Load GGUF (uint8)
import torch
from diffusers import FluxTransformer2DModel, GGUFQuantizationConfig, FluxFillPipeline
transformer = FluxTransformer2DModel.from_single_file(
"../model_hub/YarvixPA/FLUX.1-Fill-dev-gguf/flux1-fill-dev-Q8_0.gguf",
# "https://huggingface.co/YarvixPA/FLUX.1-Fill-dev-gguf/blob/main/flux1-fill-dev-Q8_0.gguf",
config="../model_hub/black-forest-labs/FLUX.1-dev/", subfolder="transformer",
quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
torch_dtype=torch.bfloat16,
).to("cuda")
pipe = FluxFillPipeline.from_pretrained(
"../model_hub/black-forest-labs/FLUX.1-Fill-dev",
transformer=transformer,
torch_dtype=torch.bfloat16,
).to("cuda")
infer_test(pipe)
Printing the parameter dtypes shows that only the transformer's weight matrices are stored as quantized uint8. Since tensors of different dtypes cannot be multiplied directly, setting GGUFQuantizationConfig(compute_dtype=torch.bfloat16) presumably means the uint8 weights are dequantized to torch.bfloat16 at compute time, the matmul runs in bfloat16, and the result stays in bfloat16. That extra dequantization is why inference ends up slower than the plain bfloat16 model; a minimal sketch of this dequantize-then-compute idea follows the dtype listing below.
for name, param in transformer.named_parameters():
    print(name, param.dtype)
time_text_embed.text_embedder.linear_2.weight torch.bfloat16
time_text_embed.text_embedder.linear_2.bias torch.bfloat16
context_embedder.weight torch.bfloat16
context_embedder.bias torch.bfloat16
x_embedder.weight torch.bfloat16
x_embedder.bias torch.bfloat16
transformer_blocks.0.norm1.linear.weight torch.uint8
transformer_blocks.0.norm1.linear.bias torch.bfloat16
transformer_blocks.0.norm1_context.linear.weight torch.uint8
transformer_blocks.0.norm1_context.linear.bias torch.bfloat16
transformer_blocks.0.attn.norm_q.weight torch.bfloat16
transformer_blocks.0.attn.norm_k.weight torch.bfloat16
transformer_blocks.0.attn.to_q.weight torch.uint8
transformer_blocks.0.attn.to_q.bias torch.bfloat16
transformer_blocks.0.attn.to_k.weight torch.uint8
transformer_blocks.0.attn.to_k.bias torch.bfloat16
transformer_blocks.0.attn.to_v.weight torch.uint8
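To make the note above concrete, here is an illustrative sketch of the dequantize-then-compute pattern that 8-bit weight storage implies. It is not diffusers' actual GGUF kernel; the block size of 32 with one fp16 scale per block follows llama.cpp's Q8_0 layout.

import torch

def dequantize_q8_0(q, scales, compute_dtype=torch.bfloat16):
    # q: (n_blocks, 32) int8 values; scales: (n_blocks,) fp16 per-block scales.
    # Each stored value is reconstructed as int8 * scale in the compute dtype.
    return (q.to(compute_dtype) * scales.to(compute_dtype)[:, None]).reshape(-1)

def q8_0_linear(x, q, scales, out_features, in_features):
    # The matmul always runs in the compute dtype: 8-bit storage saves memory,
    # but the extra dequantization makes inference slower than plain bfloat16.
    w = dequantize_q8_0(q, scales, x.dtype).reshape(out_features, in_features)
    return x @ w.T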
2.2 Load BitsAndBytes (int8)
# https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/flux.md#quantization
import torch
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, FluxTransformer2DModel, FluxFillPipeline
from transformers import BitsAndBytesConfig as BitsAndBytesConfig, T5EncoderModel
# quant_config = BitsAndBytesConfig(load_in_8bit=True)
# text_encoder_8bit = T5EncoderModel.from_pretrained(
# "../model_hub/black-forest-labs/FLUX.1-Fill-dev",
# subfolder="text_encoder_2",
# quantization_config=quant_config,
# torch_dtype=torch.bfloat16,
# )
quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True)
transformer_8bit = FluxTransformer2DModel.from_pretrained(
"../model_hub/black-forest-labs/FLUX.1-Fill-dev",
subfolder="transformer",
quantization_config=quant_config,
torch_dtype=torch.bfloat16,
)
pipe = FluxFillPipeline.from_pretrained(
"../model_hub/black-forest-labs/FLUX.1-Fill-dev",
# text_encoder_2=text_encoder_8bit,
transformer=transformer_8bit,
torch_dtype=torch.bfloat16,
).to("cuda")
As with GGUF, the weights are quantized to 8-bit integers while biases and norm parameters stay in bfloat16:
for name, param in transformer_8bit.named_parameters():
    print(name, param.dtype)
time_text_embed.timestep_embedder.linear_1.weight torch.int8
time_text_embed.timestep_embedder.linear_1.bias torch.bfloat16
time_text_embed.timestep_embedder.linear_2.weight torch.int8
time_text_embed.timestep_embedder.linear_2.bias torch.bfloat16
time_text_embed.guidance_embedder.linear_1.weight torch.int8
time_text_embed.guidance_embedder.linear_1.bias torch.bfloat16
time_text_embed.guidance_embedder.linear_2.weight torch.int8
time_text_embed.guidance_embedder.linear_2.bias torch.bfloat16
time_text_embed.text_embedder.linear_1.weight torch.int8
time_text_embed.text_embedder.linear_1.bias torch.bfloat16
time_text_embed.text_embedder.linear_2.weight torch.int8
time_text_embed.text_embedder.linear_2.bias torch.bfloat16
context_embedder.weight torch.int8
context_embedder.bias torch.bfloat16
x_embedder.weight torch.int8
x_embedder.bias torch.bfloat16
transformer_blocks.0.norm1.linear.weight torch.int8
transformer_blocks.0.norm1.linear.bias torch.bfloat16
transformer_blocks.0.norm1_context.linear.weight torch.int8
transformer_blocks.0.norm1_context.linear.bias torch.bfloat16
transformer_blocks.0.attn.norm_q.weight torch.bfloat16
transformer_blocks.0.attn.norm_k.weight torch.bfloat16
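A rough cross-check of the 13.3G load figure against the dtypes above (a sketch that only counts parameter storage for the transformer_8bit from this section; buffers and allocator overhead are ignored):

# int8 weights take 1 byte per element, the remaining bfloat16 parameters 2 bytes.
total_bytes = sum(p.numel() * p.element_size() for p in transformer_8bit.parameters())
print(f"parameter storage: {total_bytes / 2**30:.1f} GiB")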
This requires a recent bitsandbytes (0.45.1); it may need to be installed from source:
# https://huggingface.co/docs/bitsandbytes/main/en/installation?backend=Intel+CPU+%2B+GPU#multi-backend
cd bitsandbytes/
pip install intel_extension_for_pytorch
# Compile & install
export PATH="~/cmake-3.27.0-rc1/bin/:$PATH"
# apt-get install -y build-essential cmake # install build tools dependencies, unless present
cmake -DCOMPUTE_BACKEND=hip -S . # Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch
make -j16
pip install .
# pip install -e . # `-e` for "editable" install, when developing BNB (otherwise leave that out)
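After installing, a quick sanity check (a minimal sketch; `python -m bitsandbytes` prints a fuller diagnostic report):

import bitsandbytes as bnb
print(bnb.__version__)  # expect 0.45.1 or newer for the runs above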
2.3 FP8 Mixed Precision (optimum-quanto)
pip install optimum-quanto --no-deps
import torch
from diffusers import FluxTransformer2DModel, FluxPipeline, FluxFillPipeline
from transformers import T5EncoderModel, CLIPTextModel
from optimum.quanto import freeze, qfloat8, quantize
def to(model, device):
    # model.to(device) moves the ordinary parameters; quanto's quantized parameters
    # additionally carry _data/_scale tensors that have to be moved by hand.
    model = model.to(device)
    for name, param in model.named_parameters():
        if hasattr(param, '_data'):
            param._data = param._data.to(device)
            param._scale = param._scale.to(device)
            # print(name, param.device, param._data.device, param._scale.device)
        elif len(param.__dict__) == 0:
            pass
        else:
            print(name, param.__dict__.keys())
    return model
bfl_repo = "../model_hub/black-forest-labs/FLUX.1-Fill-dev"
dtype = torch.bfloat16
transformer = FluxTransformer2DModel.from_pretrained(
    bfl_repo, subfolder="transformer", torch_dtype=dtype)  # do not .to("cuda") here, or the bfloat16 weights would be loaded into GPU memory before quantization
quantize(transformer, weights=qfloat8)
freeze(transformer)
transformer = to(transformer, "cuda")  # move to GPU only after quantizing to float8; there is no ready-made helper for this, hence the custom to() above
# text_encoder_2 = T5EncoderModel.from_pretrained(bfl_repo, subfolder="text_encoder_2", torch_dtype=dtype)
# quantize(text_encoder_2, weights=qfloat8)
# freeze(text_encoder_2)
# text_encoder_2 = to(text_encoder_2, "cuda")  # likewise: move to GPU only after quantizing to float8
from diffusers import FluxTransformer2DModel, FluxPipeline, FluxFillPipeline
pipe = FluxFillPipeline.from_pretrained(bfl_repo, transformer=None, torch_dtype=dtype).to("cuda")
pipe.transformer = transformer
# pipe.text_encoder_2 = text_encoder_2
# https://github.com/huggingface/optimum-quanto/issues/343
infer_test(pipe)
If the generated image comes out as pure noise, see https://github.com/huggingface/optimum-quanto/issues/343 and patch optimum-quanto:
vim miniconda3/envs/xxx/lib/python3.10/site-packages/optimum/quanto/tensor/weights/qbytes.py
and change
and torch.cuda.get_device_capability(data.device)[0] >= 8
to (which effectively disables that branch)
and torch.cuda.get_device_capability(data.device)[0] >= 20
You can inspect the internal representation:
for name, param in transformer.named_parameters():
    print(name, param.__dict__)
    print(name, param.device, param._data.device, param._scale.device)
    break
time_text_embed.timestep_embedder.linear_1.weight {'_qtype': qtype(name='qfloat8_e4m3fn', is_floating_point=True, bits=8, dtype=torch.float8_e4m3fn, qmin=-448.0, qmax=448.0), '_axis': 0, '_data': tensor([[ -56.0000, -24.0000, 28.0000, ..., -240.0000, 192.0000,
-15.0000],
[ 26.0000, -7.0000, -10.0000, ..., 160.0000, -72.0000,
176.0000],
[ 18.0000, -22.0000, -20.0000, ..., 176.0000, 288.0000,
80.0000],
...,
[ 3.0000, 40.0000, 2.7500, ..., 56.0000, 112.0000,
-24.0000],
[ -36.0000, -56.0000, 36.0000, ..., -160.0000, 256.0000,
352.0000],
[ -56.0000, -28.0000, -52.0000, ..., 240.0000, 320.0000,
112.0000]], device='cuda:0', dtype=torch.float8_e4m3fn), '_scale': tensor([[5.6744e-05],
[5.7220e-05],
[6.5804e-05],
...,
[5.3644e-05],
[5.1975e-05],
[5.6982e-05]], device='cuda:0', dtype=torch.bfloat16), 'activation_qtype': None, '_is_param': True}
time_text_embed.timestep_embedder.linear_1.weight cuda:0 cuda:0 cuda:0
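Given the `_data` / `_scale` layout shown above, the effective bfloat16 weight of a quanto-quantized parameter can be reconstructed by hand, which is essentially what the quantized forward pass does implicitly. A sketch assuming the quantized `transformer` from this section; the result should match the original bfloat16 weight up to fp8 rounding:

name, param = next(iter(transformer.named_parameters()))
w_eff = param._data.to(torch.bfloat16) * param._scale  # (out, in) fp8 data * (out, 1) per-row scale
print(name, w_eff.shape, w_eff.dtype)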
3. Miscellaneous
3.1 Some dedicated quantization tools
- LLM quantization: https://github.com/vllm-project/llm-compressor/tree/main/examples/quantization_w8a8_fp8
- torchao quantized training (requires torch 2.5+): https://github.com/pytorch/ao/tree/main/torchao/float8
- transformer_engine: https://docs.nvidia.com/deeplearning/transformer-engine/user-guide/examples/fp8_primer.html
- Plain PyTorch dtype conversion (ComfyUI node): https://github.com/Shiba-2-shiba/ComfyUI_DiffusionModel_fp8_converter/blob/main/clip_fp8_convert.py
- Quantization for models in the transformers library: https://github.com/huggingface/optimum-nvidia
Original article: https://blog.csdn.net/yinglang19941010/article/details/145282910