基于FastAPI实现本地大模型API封装调用

🕗 发布于 2024-11-19 12:56 fastapi 大模型 人工智能 算法

关于FastAPI
- FastAPI 是一个现代、快速（高性能）的 Python Web 框架，用于构建基于标准 Python 类型提示的 API。它以简洁、直观和高效的方式提供工具，特别适合开发现代 web 服务和后端应用程序。
问题：_pad() got an unexpected keyword argument ‘padding_side’
- 解决：降级 transformers，pip install transformers==4.34.0，同时更改相关包版本以实现适配，pip install accelerate==0.25.0,pip install huggingface_hub==0.16.4
问题：报错500
- 服务器防火墙问题，只能在指定端口访问
- POST 请求的参数通过 request body 传递，请求体需要使用 application/json 格式
  - 以 postman 测试为例：Body 中选择“raw”，对应的 Headers 中的“Content-Type”设为“application/json”，参数形式是 {"prompt":"有什么推荐的咖啡吗"}（键名需与服务端读取的 prompt 字段一致）
代码实现
- fastapi_demo.py（运行开启服务）
- post.py（服务测试）

# fastapi_demo.py（运行开启服务）
from fastapi import FastAPI, Request, HTTPException
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
import json
import datetime
import torch
import logging

# Startup diagnostics: print whether CUDA is usable, the value of
# torch.version.cuda, and how many CUDA devices torch can see.
print(f"CUDA 是否可用: {torch.cuda.is_available()}")
print(f"当前 CUDA 版本: {torch.version.cuda}")
print(f"当前可用 CUDA 设备数量: {torch.cuda.device_count()}")
 
# Device selection: backend name plus an optional device index.
DEVICE = "cuda"  # run on CUDA
DEVICE_ID = "0"  # CUDA device index as a string; empty means "no explicit index"
# Full device string: "<DEVICE>:<DEVICE_ID>" when an index is set, bare DEVICE otherwise.
CUDA_DEVICE = DEVICE if not DEVICE_ID else ":".join((DEVICE, DEVICE_ID))
 
# Release cached GPU memory on the configured device.
def torch_gc():
    """Empty the CUDA cache on ``CUDA_DEVICE``; do nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    # Run both cleanup calls with CUDA_DEVICE selected as the current device.
    with torch.cuda.device(CUDA_DEVICE):
        torch.cuda.empty_cache()  # hand unused cached blocks back to the driver
        torch.cuda.ipc_collect()  # collect memory released through CUDA IPC
 
# Build the chat-template prompt string
def bulid_input(prompt, history=None, system_message=None):
    """Render a conversation into the plain-text chat template fed to the model.

    Args:
        prompt: The current user message; always appended last as a user turn.
        history: Optional iterable of earlier turns, each a mapping with
            ``'role'`` and ``'content'`` keys. A role of ``'user'`` is rendered
            as a user turn; any other role is rendered as an assistant turn.
            ``None`` (the default) and an empty list both mean "no history".
        system_message: Optional system prompt; skipped when falsy.

    Returns:
        The concatenated prompt string.

    Raises:
        KeyError: If a history item lacks ``'role'`` or ``'content'``.
    """
    # NOTE(review): these templates contain no special tokens; confirm they
    # match the chat format the loaded model was trained on.
    system_format = 'system\n\n{content}\n'
    user_format = 'user\n\n{content}\n'
    assistant_format = 'assistant\n\n{content}\n'

    parts = []

    # Optional system turn comes first.
    if system_message:
        parts.append(system_format.format(content=system_message))

    # Earlier turns, in order. `or ()` tolerates history=None, which is what a
    # JSON `null` from the request body turns into.
    for item in history or ():
        template = user_format if item['role'] == 'user' else assistant_format
        parts.append(template.format(content=item['content']))

    # The current user input always closes the prompt.
    parts.append(user_format.format(content=prompt))

    return ''.join(parts)
 
# Create the FastAPI application that the route decorators below register on
app = FastAPI()
 
# GET /: tell visitors to use the POST endpoint
@app.get("/")
async def read_root():
    """Return a fixed welcome payload pointing clients at the POST method."""
    welcome = {"message": "Welcome to the API. Please use POST method to interact with the model."}
    return welcome
 
@app.get('/favicon.ico')
async def favicon():
    """Answer favicon requests with a fixed status payload."""
    return dict(status='ok')
 
# POST /: run one chat completion against the loaded model
@app.post("/")
async def create_item(request: Request):
    """Generate a model reply for the chat request carried in the JSON body.

    The body must be a JSON object with:
      - ``prompt`` (required, non-empty): the current user message.
      - ``history`` (optional): earlier turns, forwarded to ``bulid_input``.
      - ``system_message`` (optional): system prompt.

    Returns:
        A dict with ``response`` (decoded model output), ``status`` (200) and
        ``time`` (local timestamp formatted ``%Y-%m-%d %H:%M:%S``).

    Raises:
        HTTPException: 400 when the body is not valid JSON, is not a JSON
            object, or has no prompt; 500 when tokenization, generation or
            anything else fails.
    """
    # NOTE: model.generate() is a blocking call inside an async handler, so
    # the event loop serves no other request while a generation is running.
    try:
        # request.json() already returns the parsed body; no re-serialization
        # round trip is needed.
        payload = await request.json()
        if not isinstance(payload, dict):
            raise HTTPException(status_code=400, detail="请求体必须是 JSON 对象")

        prompt = payload.get('prompt')
        if not prompt:
            raise HTTPException(status_code=400, detail="提示词不能为空")

        # `or []` also covers an explicit JSON null for history.
        history = payload.get('history') or []
        system_message = payload.get('system_message')

        # Log the validated input
        logging.info(f"收到请求: prompt={prompt}, history={history}, system_message={system_message}")

        input_str = bulid_input(prompt=prompt, history=history, system_message=system_message)
        try:
            input_ids = process_input(input_str).to(CUDA_DEVICE)
        except Exception as e:
            logging.error(f"Tokenizer 错误: {str(e)}")
            raise HTTPException(status_code=500, detail=f"Tokenizer 处理失败: {str(e)}") from e

        try:
            generated_ids = model.generate(
                input_ids=input_ids, max_new_tokens=1024, do_sample=True,
                top_p=0.5, temperature=0.95, repetition_penalty=1.1
            )
        except Exception as e:
            logging.error(f"模型生成错误: {str(e)}")
            raise HTTPException(status_code=500, detail=f"模型生成失败: {str(e)}") from e

        # Keep only the ids after the first len(input_ids[0]) positions, i.e.
        # drop the echoed prompt and decode just the continuation.
        outputs = generated_ids.tolist()[0][len(input_ids[0]):]
        response = tokenizer.decode(outputs)
        response = response.strip().replace('assistant\n\n', '').strip()  # strip the chat-template header

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # Response payload
        answer = {
            "response": response,
            "status": 200,
            "time": timestamp
        }
        # Console log line (the original concatenation emitted a stray '", '
        # right after the timestamp and failed on non-string prompts).
        log = f'[{timestamp}] prompt:"{prompt}", response:"{response!r}"'
        print(log)
        torch_gc()  # release cached GPU memory after a successful generation
        return answer

    except HTTPException:
        # Already carries the intended status code and detail. Without this
        # clause the generic handler below would rewrite every 400 into a 500.
        raise
    except json.JSONDecodeError:
        raise HTTPException(status_code=400, detail="无效的 JSON 格式")
    except Exception as e:
        logging.error(f"处理请求时发生错误: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e
 
# Script entry point: validate the GPU choice, load tokenizer and model, start the server
if __name__ == '__main__':
    # DEVICE_ID is allowed to be empty (CUDA_DEVICE then falls back to the bare
    # device name), so int(DEVICE_ID) must not be called unconditionally.
    device_index = int(DEVICE_ID) if DEVICE_ID else 0

    # Check the requested index against the devices torch can actually see.
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        raise ValueError("未检测到可用的GPU设备")
    if not 0 <= device_index < gpu_count:
        raise ValueError(f"指定的DEVICE_ID ({DEVICE_ID}) 无效。系统只有 {gpu_count} 个GPU设备(0-{gpu_count-1})")

    # Make the chosen GPU the current CUDA device.
    torch.cuda.set_device(device_index)

    # Placeholder path: replace with the directory that holds your model.
    model_name_or_path = '/data/user23262833/MemoryStrategy/ChatGLM-Finetuning/chatglm3-6b（需要填写你的模型位置所在路径）'

    # Slow tokenizer with left padding configured at construction time.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path,
        use_fast=False,
        trust_remote_code=True,
        padding_side='left'
    )

    def process_input(text):
        """Encode ``text`` with the tokenizer and return the ids as a tensor."""
        inputs = tokenizer.encode(text, return_tensors='pt')
        # Fallback for tokenizers that return a plain id list instead of a tensor.
        return inputs if torch.is_tensor(inputs) else torch.tensor([inputs])

    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        # Same setting as the tokenizer above: a checkpoint that ships custom
        # modeling code cannot be loaded non-interactively without it.
        trust_remote_code=True,
        device_map={"": device_index},  # place the whole model on the chosen GPU
        torch_dtype=torch.float16
    )

    # Start the FastAPI app. Port 6006 is used so that AutoDL's port mapping
    # can expose the API on the local machine. The host is a placeholder:
    # replace it with your local or server IP.
    uvicorn.run(app, host='需填写你的本地或者服务器ip', port=6006, workers=1)

# post.py
import requests
import json
 
def get_completion(prompt, url='需填写你的本地或者服务器ip:6006', timeout=None):
    """POST ``prompt`` to the model server and return its reply text.

    Args:
        prompt: The user message, sent as ``{"prompt": prompt}`` in a JSON body.
        url: Server endpoint. The default is a placeholder that must be
            replaced with a full URL including the scheme, for example
            ``http://<ip>:6006``.
        timeout: Seconds to wait for the server, passed to ``requests.post``.
            ``None`` (the default) waits indefinitely, as before.

    Returns:
        The ``response`` field of the server's JSON reply, or ``None`` when
        the request fails, the reply is not valid JSON, or the field is
        missing. The reason is printed in each failure case.
    """
    try:
        # json= serializes the payload and sets Content-Type: application/json.
        response = requests.post(url=url, json={"prompt": prompt}, timeout=timeout)

        # Turn 4xx/5xx statuses into an exception.
        response.raise_for_status()

        # Print the raw body for debugging.
        print("Response content:", response.text)

        return response.json()['response']
    except json.JSONDecodeError as e:
        # Must come before RequestException: on requests >= 2.27 the error
        # raised by response.json() subclasses both, so the broader handler
        # would otherwise report a bad body as a request error.
        print(f"JSON解析错误: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"请求错误: {e}")
        return None
    except (KeyError, TypeError) as e:
        # KeyError: no 'response' key. TypeError: the reply is not a JSON object.
        print(f"响应中缺少 'response' 键: {e}")
        return None
 
# Smoke test: send a greeting and print the reply; get_completion returns
# None on failure, in which case nothing more is printed here.
response = get_completion('你好')
if response is not None:
    print(response)

测试结果
参考博文：https://blog.csdn.net/qq_34717531/article/details/142092636?spm=1001.2101.3001.6661.1&utm_medium=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7EOPENSEARCH%7EPaidSort-1-142092636-blog-139909949.235%5Ev43%5Epc_blog_bottom_relevance_base5&depth_1-utm_source=distribute.pc_relevant_t0.none-task-blog-2%7Edefault%7EOPENSEARCH%7EPaidSort-1-142092636-blog-139909949.235%5Ev43%5Epc_blog_bottom_relevance_base5&utm_relevant_index=1

原文地址：https://blog.csdn.net/qq_45734745/article/details/143862063

免责声明：本站文章内容转载自网络资源，如本站内容侵犯了原著者的合法权益，可联系本站删除。更多内容请关注自学内容网（zxcms.com）！

使用docker安装RocketMQ
v /docker/rocketmq/data/namesrv/logs:/root/logs | 把容器内的/root/logs日志目录挂载到宿主机的 /docker/rocketmq/data/n
阅读更多2024-12-02
Y20030025基于php+mysql的幼儿健康管理系统设计与实现源代码配置文档
在信息化时代的浪潮中，幼儿健康管理面临着前所未有的挑战与机遇。为了更好地满足家长和幼儿园对幼儿健康管理的需求，我们致力于开发一套基于PHP的幼儿健康管理系统。这一系统的开发，旨在通过技术手段提升幼儿健
阅读更多2024-12-02
Github 2024-12-01 开源项目月报 Top20
根据Github Trendings的统计，本月(2024-12-01统计)共有20个项目上榜。
阅读更多2024-12-02
Redis中常见的延迟问题
使用复杂度高的命令，执行命令时就会耗时存储大key：如果一个key写入的数据非常大，Redis在分配内存、删除大key时都会耗时，并且持久化AOF的写回策略是always时会影响Redis性能集中过期
阅读更多2024-12-02
C#VB.NET开发整体一键国际化显示
-------------------------------项目启动根据设置的语言加载对应语言内容进行显示。---------------------------翻译完毕后多语言系统进行语言编译生成
阅读更多2024-12-02
数据结构-简单排序
【代码】数据结构-简单排序。
阅读更多2024-12-02
亚马逊IP关联是什么？
亚马逊IP关联是指在亚马逊平台上使用的IP地址或IP地址段被认定为相关联，可能导致一些特定的限制或操作问题。这种关联通常是由于多个账户或操作在同一IP地址下进行，或者存在多个操作被认为有关联的迹象，可
阅读更多2024-12-02
前端入门指南：模块打包器是什么？模块打包器的工作原理与实践
在前端开发的生态系统中，随着项目复杂度和规模的不断提升，代码管理和优化变得至关重要。模块化开发作为一种有效的代码组织方式，极大地提升了代码的可维护性和复用性。然而，面对大量的模块和复杂的依赖关系，如何
阅读更多2024-12-02
vue3项目中使用星火API
通过阅读文档可知我们需要返回给api接口的数据需要authorization，然后对话内容需要设置message设置role为user则content的内容是使用者的提问。在node环境epxress
阅读更多2024-12-02
当你访问一个网站时，数据是怎么传输的呢
电脑访问网站时数据的传输过程是一个涉及多个层次、多种协议以及众多网络设备协同工作的复杂体系。从域名解析开始，到 HTTP 请求构建、各层协议的封装、数据在网络中的传输，再到服务器端的处理和响应返回，每
阅读更多2024-12-02

基于FastAPI实现本地大模型API封装调用

相关文章