
# GLM-4-Voice-9B: Running Locally and Testing with Gradio + Notebook

## Environment setup

git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
cd GLM-4-Voice
pip install -r requirements.txt
pip install accelerate
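
Before going further, it can help to confirm that PyTorch sees the GPU and that the Matcha-TTS submodule was actually pulled in by --recurse-submodules, since the notebook code later in this article imports from third_party/Matcha-TTS. A minimal check, run from inside the GLM-4-Voice directory:

```python
# Quick environment sanity check (run inside the GLM-4-Voice directory).
import os
import torch

print("CUDA available:", torch.cuda.is_available())
print("Matcha-TTS submodule present:", os.path.isdir("third_party/Matcha-TTS"))
```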

## Download the model weights

modelscope download --model ZhipuAI/glm-4-voice-9b --local_dir /root/autodl-tmp/models/glm-4-voice-9b
modelscope download --model ZhipuAI/glm-4-voice-tokenizer --local_dir /root/autodl-tmp/models/glm-4-voice-tokenizer
modelscope download --model ZhipuAI/glm-4-voice-decoder  --local_dir /root/autodl-tmp/models/glm-4-voice-decoder
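
Before starting any services, it is worth verifying that all three downloads completed; the decoder directory in particular must contain config.yaml, flow.pt, and hift.pt, which the notebook code below loads. A small check using the paths from this article:

```python
# Verify the three model directories downloaded above are in place.
import os

base = "/root/autodl-tmp/models"
for name in ("glm-4-voice-9b", "glm-4-voice-tokenizer", "glm-4-voice-decoder"):
    path = os.path.join(base, name)
    print(name, "->", "ok" if os.path.isdir(path) and os.listdir(path) else "MISSING")

# The decoder checkpoints loaded later by AudioDecoder:
for f in ("config.yaml", "flow.pt", "hift.pt"):
    assert os.path.exists(os.path.join(base, "glm-4-voice-decoder", f)), f"missing {f}"
```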


## Testing


### Start the model server

python model_server.py --host 0.0.0.0 --model-path /root/autodl-tmp/models/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
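
Once model_server.py is running, a quick smoke test is to post a small request to its /generate_stream endpoint. The payload fields below mirror the ones the notebook code later in this article sends; the prompt here is just a throwaway string:

```python
# Minimal smoke test for the model server started above.
import json
import requests

resp = requests.post(
    "http://localhost:10000/generate_stream",
    data=json.dumps({
        "prompt": "<|system|>\nYou are a helpful assistant.<|user|>\nhi<|assistant|>streaming_transcription\n",
        "temperature": 0.3,
        "top_p": 0.9,
        "max_new_tokens": 10,
    }),
    stream=True,
)
for line in resp.iter_lines():
    print(json.loads(line))  # each streamed line should contain a "token_id" field
    break
```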

### Run from a notebook

Create a new demo.ipynb in the cloned GLM-4-Voice directory and paste the following code into it.

import sys

sys.path.insert(0, "./cosyvoice")
sys.path.insert(0, "./third_party/Matcha-TTS")
import os, random, string, json, uuid, requests
import torch, torchaudio
from transformers import WhisperFeatureExtractor, AutoTokenizer
from speech_tokenizer.modeling_whisper import WhisperVQEncoder
from speech_tokenizer.utils import extract_speech_token
from flow_inference import AudioDecoder
flow_path = "/root/autodl-tmp/models/glm-4-voice-decoder"
model_path = "/root/autodl-tmp/models/glm-4-voice-9b"
tokenizer_path = "/root/autodl-tmp/models/glm-4-voice-tokenizer"

flow_config = os.path.join(flow_path, "config.yaml")
flow_checkpoint = os.path.join(flow_path, "flow.pt")
hift_checkpoint = os.path.join(flow_path, "hift.pt")
device = "cuda"

# Speech tokenizer
whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval().to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)

# GLM
glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Flow & Hift
audio_decoder = AudioDecoder(
    config_path=flow_config,
    flow_ckpt_path=flow_checkpoint,
    hift_ckpt_path=hift_checkpoint,
    device=device,
)
def inference_fn(
    temperature: float = 0.3,
    top_p: float = 0.9,
    max_new_token: int = 2000,
    input_mode="text",
    audio_path: str | None = None,
    input_text: str | None = None,
    history: list[dict] | None = None,
    history_tokens: str = "",
    save_dir: str = "audio-tmp",
):
    # Avoid a mutable default argument: start a fresh history when none is passed in
    if history is None:
        history = []
    os.makedirs(save_dir, exist_ok=True)
    system_prompt = (
        "User will provide you with a text/speech instruction. Do it step by step. First, "
        "think about the instruction and respond in a interleaved manner, "
        "with 13 text token followed by 26 audio tokens."
    )
    if input_mode == "audio":
        assert audio_path is not None
        history.append({"role": "user", "content": {"path": audio_path}})
        audio_tokens = extract_speech_token(
            whisper_model, feature_extractor, [audio_path]
        )[0]
        if len(audio_tokens) == 0:
            raise ValueError("No audio tokens extracted")
        audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
        audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
        user_input = audio_tokens
        system_prompt = (
            "User will provide you with a speech instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    else:
        assert input_text is not None
        history.append({"role": "user", "content": input_text})
        user_input = input_text
        system_prompt = (
            "User will provide you with a text instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )

    # Gather history
    inputs = history_tokens
    if "<|system|>" not in inputs:
        inputs += f"<|system|>\n{system_prompt}"
    inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"

    with torch.no_grad():
        response = requests.post(
            "http://localhost:10000/generate_stream",
            data=json.dumps(
                {
                    "prompt": inputs,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_new_tokens": max_new_token,
                }
            ),
            stream=True,
        )
        text_tokens, audio_tokens = [], []
        audio_offset = glm_tokenizer.convert_tokens_to_ids("<|audio_0|>")
        end_token_id = glm_tokenizer.convert_tokens_to_ids("<|user|>")
        complete_tokens = []
        prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
        flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
        this_uuid = str(uuid.uuid4())
        tts_speechs = []
        tts_mels = []
        prev_mel = None
        is_finalize = False
        block_size = 10
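        # The server streams one JSON object per line, each carrying a single token_id.
        # Ids at or above audio_offset are audio tokens; they are buffered and turned
        # into waveform in blocks (block_size) so decoding can proceed incrementally,
        # while all other ids are collected as text tokens. The assistant turn ends
        # when the <|user|> token id appears.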
        for chunk in response.iter_lines():
            token_id = json.loads(chunk)["token_id"]
            if token_id == end_token_id:
                is_finalize = True
            if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
                block_size = 20
                tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)

                if prev_mel is not None:
                    prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)

                tts_speech, tts_mel = audio_decoder.token2wav(
                    tts_token,
                    uuid=this_uuid,
                    prompt_token=flow_prompt_speech_token.to(device),
                    prompt_feat=prompt_speech_feat.to(device),
                    finalize=is_finalize,
                )
                prev_mel = tts_mel

                tts_speechs.append(tts_speech.squeeze())
                tts_mels.append(tts_mel)
                # yield history, inputs, "", "", (
                #     22050,
                #     tts_speech.squeeze().cpu().numpy(),
                # ), None
                flow_prompt_speech_token = torch.cat(
                    (flow_prompt_speech_token, tts_token), dim=-1
                )
                audio_tokens = []

            if not is_finalize:
                complete_tokens.append(token_id)
                if token_id >= audio_offset:
                    audio_tokens.append(token_id - audio_offset)
                else:
                    text_tokens.append(token_id)
    tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
    name = os.path.join(save_dir, ''.join(random.sample(string.ascii_letters, k=9))+'.wav')
    torchaudio.save(name, tts_speech.unsqueeze(0), 22050, format="wav")
    history.append(
        {
            "role": "assistant",
            "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False),
            "audio_path": name,
            "type": "audio/wav",
        }
    )
    complete_text = glm_tokenizer.decode(
        complete_tokens, spaces_between_special_tokens=False
    )
    inputs += complete_text

    return history, inputs.strip()
# Initialization

from IPython.display import display, Audio

history, history_tokens = [], ''
# Change input_text and re-run this cell to keep the conversation going

history, history_tokens = inference_fn(
    input_text='用可爱的语气说:我是一头可爱的毛驴',
    history=history,
    history_tokens=history_tokens
)
print(f'Assistant:{history[-1]["content"]}')
display(Audio(history[-1]['audio_path'], autoplay=True))
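
inference_fn also accepts a spoken instruction instead of text via input_mode="audio". A sketch of a voice turn in the same conversation; the wav path is a placeholder to replace with your own recording:

```python
# Continue the same conversation with a spoken instruction instead of text.
history, history_tokens = inference_fn(
    input_mode="audio",
    audio_path="/root/autodl-tmp/question.wav",  # placeholder: path to your recording
    history=history,
    history_tokens=history_tokens,
)
print(f'Assistant:{history[-1]["content"]}')
display(Audio(history[-1]['audio_path'], autoplay=True))
```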

### Launch the Gradio web UI

python web_demo.py \
--tokenizer-path  /root/autodl-tmp/models/glm-4-voice-tokenizer \
--model-path  /root/autodl-tmp/models/glm-4-voice-9b \
--flow-path /root/autodl-tmp/models/glm-4-voice-decoder

One-click test on the CodeWithGPU community: https://www.codewithgpu.com/i/THUDM/GLM-4-Voice/glm4-voice-9b


Original article: https://blog.csdn.net/qq_39749966/article/details/143591710
