腾讯首个3D生成大模型Hunyuan3D-1.0分享
Hunyuan3D-1.0是腾讯混元团队开发的首个同时支持文字、图像转3D的大模型,可以基于文本描述或单张图像生成3D模型。
Hunyuan3D-1.0采用了多视图生成和多视图重建两阶段的方法,能够从不同视角捕捉对象的纹理和几何信息。
在多视图生成阶段,Hunyuan3D-1.0模型采用自适应CFG技术,为不同视角和时间步设置不同的CFG尺度值,以平衡生成控制和多样性。
在多视图重建阶段,Hunyuan3D-1.0模型结合了已校准和未校准的混合输入,通过专门的视角无关分支整合条件图像信息,提升生成图像中的不可见部分精度。
Hunyuan3D-1.0具备强大的泛化能力,可以重建不同尺度的物体,从大型建筑物到花草、工具等小型物件。
GitHub项目地址:https://github.com/tencent/Hunyuan3D-1
一、环境安装
1、python环境
建议安装python版本在3.10以上。
2、pip库安装
pip install torch==2.2.0+cu118 torchvision==0.17.0+cu118 torchaudio==2.2.0 --extra-index-url https://download.pytorch.org/whl/cu118
pip install diffusers transformers rembg tqdm omegaconf matplotlib opencv-python imageio jaxtyping einops SentencePiece accelerate trimesh PyMCubes xatlas libigl open3d -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install git+https://github.com/facebookresearch/pytorch3d
pip install git+https://github.com/NVlabs/nvdiffrast
3、模型下载:
git lfs install
git clone https://huggingface.co/tencent/Hunyuan3D-1
二、功能测试
1、运行测试:
(1)python代码调用测试
import os
import torch
from PIL import Image
import argparse
from infer import Text2Image, Removebg, Image2Views, Views2Mesh, GifRenderer
def initialize_models(args):
    """Construct every model the pipeline needs.

    Returns a 5-tuple:
        (background remover, image->views model, views->mesh model,
         text->image model or None, GIF renderer or None)
    The text-to-image model and the renderer are built only when the
    corresponding CLI options ask for them.
    """
    background_remover = Removebg()
    views_generator = Image2Views(device=args.device, use_lite=args.use_lite)
    mesh_builder = Views2Mesh(
        args.mv23d_cfg_path,
        args.mv23d_ckt_path,
        args.device,
        use_lite=args.use_lite,
    )
    # Only needed when a text prompt drives the pipeline.
    t2i_model = (
        Text2Image(
            pretrain=args.text2image_path,
            device=args.device,
            save_memory=args.save_memory,
        )
        if args.text_prompt
        else None
    )
    # Renderer is optional; skip construction unless rendering was requested.
    renderer = GifRenderer(device=args.device) if args.do_render else None
    return background_remover, views_generator, mesh_builder, t2i_model, renderer
def text_to_image(text_to_image_model, args):
    """Generate an RGB image from ``args.text_prompt``.

    Seed and diffusion step count come straight from the CLI arguments;
    the model's return value (a PIL image) is passed through unchanged.
    """
    return text_to_image_model(
        args.text_prompt,
        seed=args.t2i_seed,
        steps=args.t2i_steps,
    )
def remove_background(rembg_model, res_rgb_pil):
    """Strip the background from *res_rgb_pil* using the rembg model."""
    # Thin delegation wrapper kept so the pipeline stages read uniformly.
    rgba = rembg_model(res_rgb_pil)
    return rgba
def image_to_views(image_to_views_model, res_rgba_pil, args):
    """Generate a multi-view grid from a single RGBA image.

    Sampling seed and step count are read from the CLI arguments; the
    model's return value is passed through unchanged.
    """
    seed, steps = args.gen_seed, args.gen_steps
    return image_to_views_model(res_rgba_pil, seed=seed, steps=steps)
def views_to_mesh(views_to_mesh_model, views_grid_pil, cond_img, args):
    """Reconstruct a 3D mesh from the multi-view grid.

    Parameters
    ----------
    views_to_mesh_model : callable
        The Views2Mesh model.
    views_grid_pil : PIL image
        Grid of generated views.
    cond_img :
        The conditioning (input) image.
    args : argparse.Namespace
        Supplies seed, face budget, output folder and texture flag.

    Returns the model's result. The original wrapper silently discarded
    it and returned ``None``; propagating it is backward-compatible
    (existing callers ignore the return value) and lets new callers
    inspect the reconstruction output.
    """
    return views_to_mesh_model(
        views_grid_pil,
        cond_img,
        seed=args.gen_seed,
        target_face_count=args.max_faces_num,
        save_folder=args.save_folder,
        do_texture_mapping=args.do_texture_mapping,
    )
def render_gif(gif_renderer, args):
    """Render the saved mesh (``mesh.obj`` in the output folder) to a GIF."""
    mesh_path = os.path.join(args.save_folder, 'mesh.obj')
    gif_path = os.path.join(args.save_folder, 'output.gif')
    gif_renderer(mesh_path, gif_dst_path=gif_path)
def save_image(image, path):
    """Write *image* to disk at *path* via the image object's own writer."""
    writer = image.save
    writer(path)
def get_args():
    """Define and parse the command-line interface for the pipeline."""
    cli = argparse.ArgumentParser()
    # Runtime / model configuration.
    cli.add_argument("--use_lite", default=False, action="store_true")
    cli.add_argument("--mv23d_cfg_path", default="./svrm/configs/svrm.yaml", type=str)
    cli.add_argument("--mv23d_ckt_path", default="Hunyuan3D-1/svrm/svrm.safetensors", type=str)
    cli.add_argument("--text2image_path", default="Hunyuan3D-1/weights/hunyuanDiT", type=str)
    cli.add_argument("--save_folder", default="./outputs/test/", type=str)
    # Exactly one of the two prompts is expected (enforced in main()).
    cli.add_argument("--text_prompt", default="", type=str)
    cli.add_argument("--image_prompt", default="", type=str)
    cli.add_argument("--device", default="cuda:0", type=str)
    # Sampling controls for the two generation stages.
    cli.add_argument("--t2i_seed", default=0, type=int)
    cli.add_argument("--t2i_steps", default=25, type=int)
    cli.add_argument("--gen_seed", default=0, type=int)
    cli.add_argument("--gen_steps", default=50, type=int)
    cli.add_argument(
        "--max_faces_num",
        default=80000,
        type=int,
        help="max num of face, suggest 80000 for effect, 10000 for speed",
    )
    # Optional post-processing switches.
    cli.add_argument("--save_memory", default=False, action="store_true")
    cli.add_argument("--do_texture_mapping", default=False, action="store_true")
    cli.add_argument("--do_render", default=False, action="store_true")
    return cli.parse_args()
def main():
    """Run the full Hunyuan3D pipeline: prompt -> image -> views -> mesh (-> GIF)."""
    args = get_args()
    # Exactly one prompt source must be supplied.
    assert not (args.text_prompt and args.image_prompt), "Specify either a text or an image prompt, not both"
    assert args.text_prompt or args.image_prompt, "Either a text prompt or an image prompt must be specified"

    os.makedirs(args.save_folder, exist_ok=True)
    (rembg_model, image_to_views_model, views_to_mesh_model,
     text_to_image_model, gif_renderer) = initialize_models(args)

    # Stage 1: obtain the conditioning RGB image (generated or loaded from disk).
    if args.text_prompt:
        rgb = text_to_image(text_to_image_model, args)
        save_image(rgb, os.path.join(args.save_folder, "img.jpg"))
    else:
        rgb = Image.open(args.image_prompt)

    # Stage 2: background removal.
    rgba = remove_background(rembg_model, rgb)
    save_image(rgba, os.path.join(args.save_folder, "img_nobg.png"))

    # Stage 3: single image -> multi-view grid.
    (views_grid, cond_img), _ = image_to_views(image_to_views_model, rgba, args)
    save_image(views_grid, os.path.join(args.save_folder, "views.jpg"))

    # Stage 4: multi-view grid -> 3D mesh (written into save_folder).
    views_to_mesh(views_to_mesh_model, views_grid, cond_img, args)

    # Stage 5: optional turntable GIF of the reconstructed mesh.
    if args.do_render:
        render_gif(gif_renderer, args)


if __name__ == "__main__":
    main()
未完......
更多详细的欢迎关注:杰哥新技术
原文地址:https://blog.csdn.net/m0_71062934/article/details/143665783
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!