自学内容网

腾讯首个3D生成大模型Hunyuan3D-1.0分享

Hunyuan3D-1.0是腾讯混元团队开发的首个同时支持文字、图像转3D的大模型,可以基于文本描述或单张图像生成3D模型。

Hunyuan3D-1.0采用了多视图生成和多视图重建两阶段的方法,能够从不同视角捕捉对象的纹理和几何信息。

在多视图生成阶段,Hunyuan3D-1.0模型采用自适应CFG技术,为不同视角和时间步设置不同的CFG尺度值,以平衡生成控制和多样性。

在多视图重建阶段,Hunyuan3D-1.0模型结合了已校准和未校准的混合输入,通过专门的视角无关分支整合条件图像信息,提升生成图像中的不可见部分精度。

Hunyuan3D-1.0具备强大的泛化能力,可以重建不同尺度的物体,从建筑到花草工具等。

GitHub项目地址:https://github.com/tencent/Hunyuan3D-1

一、环境安装

1、python环境

建议安装python版本在3.10以上。

2、pip库安装

pip install torch==2.2.0+cu118 torchvision==0.17.0+cu118 torchaudio==2.2.0 --extra-index-url https://download.pytorch.org/whl/cu118

pip install diffusers transformers rembg tqdm omegaconf matplotlib opencv-python imageio jaxtyping einops SentencePiece accelerate trimesh PyMCubes xatlas libigl open3d -i https://pypi.tuna.tsinghua.edu.cn/simple

pip install git+https://github.com/facebookresearch/pytorch3d

pip install git+https://github.com/NVlabs/nvdiffrast

3、模型下载

git lfs install

git clone https://huggingface.co/tencent/Hunyuan3D-1

二、功能测试

1、运行测试

(1)python代码调用测试

import os
import torch
from PIL import Image
import argparse

from infer import Text2Image, Removebg, Image2Views, Views2Mesh, GifRenderer

def initialize_models(args):
    """Build every model the pipeline needs and return them as one tuple.

    The background remover, image-to-views, and views-to-mesh models are
    always created. The text-to-image model and the GIF renderer are
    optional: they stay ``None`` unless the corresponding CLI options
    (``--text_prompt`` / ``--do_render``) request them, avoiding the cost
    of loading weights that will never be used.

    Returns:
        (rembg_model, image_to_views_model, views_to_mesh_model,
         text_to_image_model, gif_renderer)
    """
    background_remover = Removebg()
    img_to_views = Image2Views(device=args.device, use_lite=args.use_lite)
    views_to_mesh = Views2Mesh(
        args.mv23d_cfg_path, args.mv23d_ckt_path, args.device, use_lite=args.use_lite
    )

    # Lazily construct the text-to-image model only when a prompt was given.
    txt_to_img = (
        Text2Image(
            pretrain=args.text2image_path, device=args.device, save_memory=args.save_memory
        )
        if args.text_prompt
        else None
    )

    # GIF rendering is equally optional.
    renderer = GifRenderer(device=args.device) if args.do_render else None

    return background_remover, img_to_views, views_to_mesh, txt_to_img, renderer

def text_to_image(text_to_image_model, args):
    """Generate an RGB PIL image from the CLI text prompt.

    Delegates to the already-initialized text-to-image model, forwarding
    the prompt, seed, and diffusion step count from the parsed arguments.
    """
    return text_to_image_model(
        args.text_prompt,
        seed=args.t2i_seed,
        steps=args.t2i_steps,
    )

def remove_background(rembg_model, res_rgb_pil):
    """Strip the background from *res_rgb_pil* via the rembg model.

    Returns whatever the model produces; downstream code treats it as an
    RGBA image with a transparent background.
    """
    result = rembg_model(res_rgb_pil)
    return result

def image_to_views(image_to_views_model, res_rgba_pil, args):
    """Run multi-view generation on a background-free (RGBA) image.

    Forwards the generation seed and step count from the parsed CLI
    arguments and returns the model's result unchanged.
    """
    result = image_to_views_model(
        res_rgba_pil,
        seed=args.gen_seed,
        steps=args.gen_steps,
    )
    return result

def views_to_mesh(views_to_mesh_model, views_grid_pil, cond_img, args):
    """Reconstruct a 3D mesh from the multi-view grid and condition image.

    The model writes its outputs into ``args.save_folder`` as a side
    effect; nothing is returned.
    """
    # Gather all model options from the CLI namespace in one place.
    mesh_options = {
        "seed": args.gen_seed,
        "target_face_count": args.max_faces_num,
        "save_folder": args.save_folder,
        "do_texture_mapping": args.do_texture_mapping,
    }
    views_to_mesh_model(views_grid_pil, cond_img, **mesh_options)

def render_gif(gif_renderer, args):
    """Render the reconstructed mesh into a turntable GIF.

    Reads ``mesh.obj`` from the save folder and writes ``output.gif``
    next to it.
    """
    mesh_path = os.path.join(args.save_folder, 'mesh.obj')
    gif_path = os.path.join(args.save_folder, 'output.gif')
    gif_renderer(mesh_path, gif_dst_path=gif_path)

def save_image(image, path):
    """Persist *image* to *path* by delegating to its ``save()`` method."""
    image.save(path)

def get_args():
    """Define and parse the command-line interface for the pipeline.

    Argument order is preserved so ``--help`` output stays identical.
    """
    arg_parser = argparse.ArgumentParser()
    add = arg_parser.add_argument  # hoist the bound method; identical behavior

    add("--use_lite", default=False, action="store_true")
    add("--mv23d_cfg_path", default="./svrm/configs/svrm.yaml", type=str)
    add("--mv23d_ckt_path", default="Hunyuan3D-1/svrm/svrm.safetensors", type=str)
    add("--text2image_path", default="Hunyuan3D-1/weights/hunyuanDiT", type=str)
    add("--save_folder", default="./outputs/test/", type=str)
    add("--text_prompt", default="", type=str)
    add("--image_prompt", default="", type=str)
    add("--device", default="cuda:0", type=str)
    add("--t2i_seed", default=0, type=int)
    add("--t2i_steps", default=25, type=int)
    add("--gen_seed", default=0, type=int)
    add("--gen_steps", default=50, type=int)
    add("--max_faces_num", default=80000, type=int, help="max num of face, suggest 80000 for effect, 10000 for speed")
    add("--save_memory", default=False, action="store_true")
    add("--do_texture_mapping", default=False, action="store_true")
    add("--do_render", default=False, action="store_true")

    return arg_parser.parse_args()

def main():
    """Run the full Hunyuan3D-1.0 pipeline: prompt -> image -> views -> mesh (-> GIF).

    Exactly one of --text_prompt / --image_prompt must be supplied.
    Intermediate images and the final mesh are written to args.save_folder.

    Raises:
        ValueError: if both prompts or neither prompt is provided.
    """
    args = get_args()

    # Validate inputs explicitly instead of with `assert`: assertions are
    # stripped under `python -O`, which would silently skip this check.
    if args.text_prompt and args.image_prompt:
        raise ValueError("Specify either a text or an image prompt, not both")
    if not (args.text_prompt or args.image_prompt):
        raise ValueError("Either a text prompt or an image prompt must be specified")

    os.makedirs(args.save_folder, exist_ok=True)

    rembg_model, image_to_views_model, views_to_mesh_model, text_to_image_model, gif_renderer = initialize_models(args)

    # Stage 1: Text to Image (or load the user-supplied image instead).
    if args.text_prompt:
        res_rgb_pil = text_to_image(text_to_image_model, args)
        save_image(res_rgb_pil, os.path.join(args.save_folder, "img.jpg"))
    else:
        res_rgb_pil = Image.open(args.image_prompt)

    # Stage 2: Remove Background
    res_rgba_pil = remove_background(rembg_model, res_rgb_pil)
    save_image(res_rgba_pil, os.path.join(args.save_folder, "img_nobg.png"))

    # Stage 3: Image to Views (grid of generated views + condition image)
    (views_grid_pil, cond_img), _ = image_to_views(image_to_views_model, res_rgba_pil, args)
    save_image(views_grid_pil, os.path.join(args.save_folder, "views.jpg"))

    # Stage 4: Views to Mesh (writes results into args.save_folder)
    views_to_mesh(views_to_mesh_model, views_grid_pil, cond_img, args)

    # Stage 5: Render GIF (optional)
    if args.do_render:
        render_gif(gif_renderer, args)

if __name__ == "__main__":
    main()

未完......

更多详细的欢迎关注:杰哥新技术


原文地址:https://blog.csdn.net/m0_71062934/article/details/143665783

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!