
# GLM-4-Voice-9B: Running Locally and Testing with Gradio + Notebook

## Environment setup

git clone --recurse-submodules https://github.com/THUDM/GLM-4-Voice
cd GLM-4-Voice
pip install -r requirements.txt
pip install accelerate
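
Before going further, it can help to confirm that PyTorch sees the GPU and that the Matcha-TTS submodule was actually pulled in by --recurse-submodules, since the notebook code later in this article imports from third_party/Matcha-TTS. A minimal check, run from inside the GLM-4-Voice directory:

```python
# Quick environment sanity check (run inside the GLM-4-Voice directory).
import os
import torch

print("CUDA available:", torch.cuda.is_available())
print("Matcha-TTS submodule present:", os.path.isdir("third_party/Matcha-TTS"))
```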

## Download the model weights

modelscope download --model ZhipuAI/glm-4-voice-9b --local_dir /root/autodl-tmp/models/glm-4-voice-9b
modelscope download --model ZhipuAI/glm-4-voice-tokenizer --local_dir /root/autodl-tmp/models/glm-4-voice-tokenizer
modelscope download --model ZhipuAI/glm-4-voice-decoder  --local_dir /root/autodl-tmp/models/glm-4-voice-decoder
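
Before starting any services, it is worth verifying that all three downloads completed; the decoder directory in particular must contain config.yaml, flow.pt, and hift.pt, which the notebook code below loads. A small check using the paths from this article:

```python
# Verify the three model directories downloaded above are in place.
import os

base = "/root/autodl-tmp/models"
for name in ("glm-4-voice-9b", "glm-4-voice-tokenizer", "glm-4-voice-decoder"):
    path = os.path.join(base, name)
    print(name, "->", "ok" if os.path.isdir(path) and os.listdir(path) else "MISSING")

# The decoder checkpoints loaded later by AudioDecoder:
for f in ("config.yaml", "flow.pt", "hift.pt"):
    assert os.path.exists(os.path.join(base, "glm-4-voice-decoder", f)), f"missing {f}"
```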


## Testing


### Start the model server

python model_server.py --host 0.0.0.0 --model-path /root/autodl-tmp/models/glm-4-voice-9b --port 10000 --dtype bfloat16 --device cuda:0
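
Once model_server.py is running, a quick smoke test is to post a small request to its /generate_stream endpoint. The payload fields below mirror the ones the notebook code later in this article sends; the prompt here is just a throwaway string:

```python
# Minimal smoke test for the model server started above.
import json
import requests

resp = requests.post(
    "http://localhost:10000/generate_stream",
    data=json.dumps({
        "prompt": "<|system|>\nYou are a helpful assistant.<|user|>\nhi<|assistant|>streaming_transcription\n",
        "temperature": 0.3,
        "top_p": 0.9,
        "max_new_tokens": 10,
    }),
    stream=True,
)
for line in resp.iter_lines():
    print(json.loads(line))  # each streamed line should contain a "token_id" field
    break
```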

### Run from a notebook

Create a new demo.ipynb in the cloned GLM-4-Voice directory and paste the following code into it.

import sys

sys.path.insert(0, "./cosyvoice")
sys.path.insert(0, "./third_party/Matcha-TTS")
import os, random, string, json, uuid, requests
import torch, torchaudio
from transformers import WhisperFeatureExtractor, AutoTokenizer
from speech_tokenizer.modeling_whisper import WhisperVQEncoder
from speech_tokenizer.utils import extract_speech_token
from flow_inference import AudioDecoder
flow_path = "/root/autodl-tmp/models/glm-4-voice-decoder"
model_path = "/root/autodl-tmp/models/glm-4-voice-9b"
tokenizer_path = "/root/autodl-tmp/models/glm-4-voice-tokenizer"

flow_config = os.path.join(flow_path, "config.yaml")
flow_checkpoint = os.path.join(flow_path, "flow.pt")
hift_checkpoint = os.path.join(flow_path, "hift.pt")
device = "cuda"

# Speech tokenizer
whisper_model = WhisperVQEncoder.from_pretrained(tokenizer_path).eval().to(device)
feature_extractor = WhisperFeatureExtractor.from_pretrained(tokenizer_path)

# GLM
glm_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Flow & Hift
audio_decoder = AudioDecoder(
    config_path=flow_config,
    flow_ckpt_path=flow_checkpoint,
    hift_ckpt_path=hift_checkpoint,
    device=device,
)
def inference_fn(
    temperature: float = 0.3,
    top_p: float = 0.9,
    max_new_token: int = 2000,
    input_mode="text",
    audio_path: str | None = None,
    input_text: str | None = None,
    history: list[dict] | None = None,
    history_tokens: str = "",
    save_dir: str = "audio-tmp",
):
    # Avoid a mutable default argument: start a fresh history when none is passed in
    if history is None:
        history = []
    os.makedirs(save_dir, exist_ok=True)
    system_prompt = (
        "User will provide you with a text/speech instruction. Do it step by step. First, "
        "think about the instruction and respond in a interleaved manner, "
        "with 13 text token followed by 26 audio tokens."
    )
    if input_mode == "audio":
        assert audio_path is not None
        history.append({"role": "user", "content": {"path": audio_path}})
        audio_tokens = extract_speech_token(
            whisper_model, feature_extractor, [audio_path]
        )[0]
        if len(audio_tokens) == 0:
            raise ValueError("No audio tokens extracted")
        audio_tokens = "".join([f"<|audio_{x}|>" for x in audio_tokens])
        audio_tokens = "<|begin_of_audio|>" + audio_tokens + "<|end_of_audio|>"
        user_input = audio_tokens
        system_prompt = (
            "User will provide you with a speech instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )
    else:
        assert input_text is not None
        history.append({"role": "user", "content": input_text})
        user_input = input_text
        system_prompt = (
            "User will provide you with a text instruction. Do it step by step. First, "
            "think about the instruction and respond in a interleaved manner, "
            "with 13 text token followed by 26 audio tokens."
        )

    # Gather history
    inputs = history_tokens
    if "<|system|>" not in inputs:
        inputs += f"<|system|>\n{system_prompt}"
    inputs += f"<|user|>\n{user_input}<|assistant|>streaming_transcription\n"

    with torch.no_grad():
        response = requests.post(
            "http://localhost:10000/generate_stream",
            data=json.dumps(
                {
                    "prompt": inputs,
                    "temperature": temperature,
                    "top_p": top_p,
                    "max_new_tokens": max_new_token,
                }
            ),
            stream=True,
        )
        text_tokens, audio_tokens = [], []
        audio_offset = glm_tokenizer.convert_tokens_to_ids("<|audio_0|>")
        end_token_id = glm_tokenizer.convert_tokens_to_ids("<|user|>")
        complete_tokens = []
        prompt_speech_feat = torch.zeros(1, 0, 80).to(device)
        flow_prompt_speech_token = torch.zeros(1, 0, dtype=torch.int64).to(device)
        this_uuid = str(uuid.uuid4())
        tts_speechs = []
        tts_mels = []
        prev_mel = None
        is_finalize = False
        block_size = 10
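        # The server streams one JSON object per line, each carrying a single token_id.
        # Ids at or above audio_offset are audio tokens; they are buffered and turned
        # into waveform in blocks (block_size) so decoding can proceed incrementally,
        # while all other ids are collected as text tokens. The assistant turn ends
        # when the <|user|> token id appears.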
        for chunk in response.iter_lines():
            token_id = json.loads(chunk)["token_id"]
            if token_id == end_token_id:
                is_finalize = True
            if len(audio_tokens) >= block_size or (is_finalize and audio_tokens):
                block_size = 20
                tts_token = torch.tensor(audio_tokens, device=device).unsqueeze(0)

                if prev_mel is not None:
                    prompt_speech_feat = torch.cat(tts_mels, dim=-1).transpose(1, 2)

                tts_speech, tts_mel = audio_decoder.token2wav(
                    tts_token,
                    uuid=this_uuid,
                    prompt_token=flow_prompt_speech_token.to(device),
                    prompt_feat=prompt_speech_feat.to(device),
                    finalize=is_finalize,
                )
                prev_mel = tts_mel

                tts_speechs.append(tts_speech.squeeze())
                tts_mels.append(tts_mel)
                # yield history, inputs, "", "", (
                #     22050,
                #     tts_speech.squeeze().cpu().numpy(),
                # ), None
                flow_prompt_speech_token = torch.cat(
                    (flow_prompt_speech_token, tts_token), dim=-1
                )
                audio_tokens = []

            if not is_finalize:
                complete_tokens.append(token_id)
                if token_id >= audio_offset:
                    audio_tokens.append(token_id - audio_offset)
                else:
                    text_tokens.append(token_id)
    tts_speech = torch.cat(tts_speechs, dim=-1).cpu()
    name = os.path.join(save_dir, ''.join(random.sample(string.ascii_letters, k=9))+'.wav')
    torchaudio.save(name, tts_speech.unsqueeze(0), 22050, format="wav")
    history.append(
        {
            "role": "assistant",
            "content": glm_tokenizer.decode(text_tokens, ignore_special_tokens=False),
            "audio_path": name,
            "type": "audio/wav",
        }
    )
    complete_text = glm_tokenizer.decode(
        complete_tokens, spaces_between_special_tokens=False
    )
    inputs += complete_text

    return history, inputs.strip()
# Initialization

from IPython.display import display, Audio

history, history_tokens = [], ''
# Change input_text and re-run this cell to keep the conversation going

history, history_tokens = inference_fn(
    input_text='用可爱的语气说:我是一头可爱的毛驴',
    history=history,
    history_tokens=history_tokens
)
print(f'Assistant:{history[-1]["content"]}')
display(Audio(history[-1]['audio_path'], autoplay=True))
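
inference_fn also accepts a spoken instruction instead of text via input_mode="audio". A sketch of a voice turn in the same conversation; the wav path is a placeholder to replace with your own recording:

```python
# Continue the same conversation with a spoken instruction instead of text.
history, history_tokens = inference_fn(
    input_mode="audio",
    audio_path="/root/autodl-tmp/question.wav",  # placeholder: path to your recording
    history=history,
    history_tokens=history_tokens,
)
print(f'Assistant:{history[-1]["content"]}')
display(Audio(history[-1]['audio_path'], autoplay=True))
```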

### Launch the Gradio web UI

python web_demo.py \
--tokenizer-path  /root/autodl-tmp/models/glm-4-voice-tokenizer \
--model-path  /root/autodl-tmp/models/glm-4-voice-9b \
--flow-path /root/autodl-tmp/models/glm-4-voice-decoder

One-click test on the CodeWithGPU community: https://www.codewithgpu.com/i/THUDM/GLM-4-Voice/glm4-voice-9b


Original article: https://blog.csdn.net/qq_39749966/article/details/143591710
