LLM之milvus的使用记录
前言
试试这个数据库
Milvus安装
python接口:pip install pymilvus
docker安装:
通过 docker-compose + xxx.yml 文件实现:
wget https://github.com/milvus-io/milvus/releases/download/v2.4.0/milvus-standalone-docker-compose.yml -O docker-compose.yml
sudo docker-compose up -d
如果没有安装过docker-compose,会报错(提示找不到 docker-compose 命令)。
执行下面这行代码,再重复上面的代码,当然如果你网络不好,可能就需要考虑添加镜像源啥的了
sudo apt install docker-compose
安装完之后,运行
docker ps
会显示你在运行中的容器,这边安装好之后会出现 milvus-standalone、milvus-etcd、milvus-minio 三个容器。
Milvus数据类型与python对应的数据类型
Milvus | Python |
DataType.INT64 | numpy.int64 |
DataType.INT32 | numpy.int32 |
DataType.INT16 | numpy.int16 |
DataType.BOOL | bool |
DataType.FLOAT | numpy.float32 |
DataType.DOUBLE | numpy.double |
DataType.ARRAY | list |
DataType.VARCHAR | str |
DataType.JSON | dict |
FLOAT_VECTOR(浮点数向量) | numpy.ndarray or list (元素为numpy.float32) |
Milvus操作
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection,utility
from pymilvus import MilvusClient
from tqdm import tqdm
from datetime import datetime
class milvus_db:
    """Convenience wrapper around a single Milvus collection of text chunks.

    The wrapper connects to a Milvus server on construction, opens (or
    creates) one collection and exposes helpers for indexing, inserting,
    searching and re-ranking.  It relies on the ``pymilvus`` names imported
    at module level (``connections``, ``utility``, ``Collection``,
    ``FieldSchema``, ``CollectionSchema``, ``DataType``).
    """

    def __init__(self, url: str = '0.0.0.0', collection_name: str = 'data_store', port: str = "19530"):
        """Connect to Milvus and bind ``self.collection``.

        Args:
            url: Host name or IP address passed to ``connections.connect``.
            collection_name: Collection to open; it is created with the
                schema from :meth:`get_schema` when it does not exist yet.
            port: Port passed to ``connections.connect``.
        """
        connections.connect(host=url, port=port)
        if utility.has_collection(collection_name):
            self.collection = Collection(name=collection_name)
        else:
            self.collection = Collection(name=collection_name, schema=self.get_schema())
        print(self.collection.schema)

    def get_schema(self):
        """Build the collection schema: id, text, file_name and text_embedding."""
        # Primary key; local names avoid shadowing the ``id`` builtin.
        id_field = FieldSchema(name="id", dtype=DataType.VARCHAR, max_length=128, is_primary=True)
        text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=58192)
        file_name_field = FieldSchema(name="file_name", dtype=DataType.VARCHAR, max_length=512)
        # ``dim`` is the number of elements in each vector; it has to equal
        # the output size of the embedding model that produces the vectors.
        embedding_field = FieldSchema(name="text_embedding", dtype=DataType.FLOAT_VECTOR, dim=1024)
        return CollectionSchema(
            fields=[id_field, text_field, file_name_field, embedding_field],
            description="文本与文本嵌入存储",
        )

    def change_collection(self, collection_name):
        """Rebind ``self.collection`` to ``collection_name``.

        The schema from :meth:`get_schema` is always passed to ``Collection``.
        """
        self.collection = Collection(name=collection_name, schema=self.get_schema())

    def delete_collection(self, collection_name):
        """Drop the collection called ``collection_name`` from the server."""
        utility.drop_collection(collection_name)

    def create_index(self, metric_type='L2', index_name='L2'):
        """Create an AUTOINDEX on ``text_embedding`` and load the collection.

        Args:
            metric_type: Metric stored in the index parameters.
            index_name: Name given to the created index.
        """
        index_params = {
            "index_type": "AUTOINDEX",
            "metric_type": metric_type,
            "params": {},
        }
        self.collection.create_index(
            field_name="text_embedding",
            index_params=index_params,
            index_name=index_name,
        )
        self.collection.load()

    def drop_index(self, index_name=None):
        """Release the collection, then drop its index.

        Args:
            index_name: Optional name of the index to drop (for example the
                ``index_name`` given to :meth:`create_index`).  When omitted
                the call is made without a name, exactly as before.
        """
        self.collection.release()
        if index_name is None:
            self.collection.drop_index()
        else:
            self.collection.drop_index(index_name=index_name)

    def insert_data(self, text_id_list, text_list, file_name_list, text_embedding_list, batch_size=128):
        """Insert rows into the collection in column-oriented batches.

        Args:
            text_id_list: Primary-key values, one per row.
            text_list: Text values, one per row.
            file_name_list: File-name values, one per row.
            text_embedding_list: Embedding vectors, one per row.
            batch_size: Maximum number of rows sent per ``insert`` call.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1 or the four
                inputs do not have the same number of rows.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        # Materialise every column so any iterable is accepted and so the
        # lengths can be compared before anything is written.
        columns = [
            list(column)
            for column in (text_id_list, text_list, file_name_list, text_embedding_list)
        ]
        row_count = len(columns[0])
        if any(len(column) != row_count for column in columns):
            raise ValueError(
                "text_id_list, text_list, file_name_list and text_embedding_list "
                "must all have the same length"
            )
        start = datetime.now()
        # One request per batch instead of one request per row.
        for offset in range(0, row_count, batch_size):
            self.collection.insert([column[offset:offset + batch_size] for column in columns])
        end = datetime.now()
        print(f'插入数据消化时间{end-start}')

    def search(self, query_embedding, top_k=10, metric_type='L2'):
        """Run a vector search on ``text_embedding`` for one query vector.

        Args:
            query_embedding: The query vector.
            top_k: Value passed as ``limit`` to the search call.
            metric_type: Metric placed in the search parameters.
                NOTE(review): presumably has to match the metric the index
                was created with -- confirm against the Milvus docs.

        Returns:
            Whatever ``Collection.search`` returns; the ``text`` and
            ``file_name`` fields are requested as output fields.
        """
        search_params = {
            "metric_type": metric_type,
            "params": {"level": 2},
        }
        return self.collection.search(
            [query_embedding],
            anns_field="text_embedding",
            param=search_params,
            limit=top_k,
            output_fields=["text", "file_name"],
        )

    def list_collections(self):
        """Return the result of ``utility.list_collections()``."""
        return utility.list_collections()

    def reranker_init(self, model_name_or_path, device="cpu"):
        """Create the built-in BGE reranker and store it on ``self.reranker``.

        Args:
            model_name_or_path: Passed as ``model_name`` to ``BGERerankFunction``.
            device: Device string forwarded to ``BGERerankFunction``,
                e.g. ``'cpu'`` or ``'cuda:0'``.
        """
        # Imported here because the reranker lives in an optional part of
        # pymilvus that is only needed when this method is actually used.
        from pymilvus.model.reranker import BGERerankFunction

        self.reranker = BGERerankFunction(
            model_name=model_name_or_path,
            device=device,
        )

    def rereank(self, query, serach_result, top_k, rerank_client=None):
        """Re-rank the hits of the first query in ``serach_result``.

        Args:
            query: Query text handed to the reranker.
            serach_result: Search result whose first element is the hit list;
                each hit must expose ``entity.get('text')``.
            top_k: Number of results requested from the reranker.
            rerank_client: Optional external client.  When given, its
                ``rerank(query=, documents=, top_n=)`` method is called and
                the response's ``'results'`` entries are mapped back to hits
                through their ``'index'`` key.

        Returns:
            An empty list when there are no hits.  With ``rerank_client`` a
            list of the original hit objects in reranked order; otherwise
            whatever ``self.reranker`` returns.

        Raises:
            AttributeError: If no ``rerank_client`` is given and
                :meth:`reranker_init` has not been called.
        """
        if len(serach_result) == 0 or len(serach_result[0]) == 0:
            return []
        hits = serach_result[0]
        documents_list = [hit.entity.get('text') for hit in hits]
        # An externally supplied (non-Milvus) reranker takes precedence.
        if rerank_client:
            response = rerank_client.rerank(
                query=query,
                documents=documents_list,
                top_n=top_k,
            )
            return [hits[item['index']] for item in response['results']]
        reranker = getattr(self, 'reranker', None)
        if reranker is None:
            raise AttributeError(
                "reranker is not initialised; call reranker_init() first "
                "or pass rerank_client"
            )
        return reranker(
            query=query,
            documents=documents_list,
            top_k=top_k,
        )
Milvus可视化
安装好milvus docker之后,哪怕milvus在运行着都可以继续接下来的步骤哦。
因为输入下面的代码就行
# 执行命令,加个 -d 在后台运行
docker run -d -p 8000:3000 -e MILVUS_URL=127.0.0.1:19530 zilliz/attu:v2.2.8
如果像我一样把Milvus跑在服务器上、想在本地电脑上查看,就把MILVUS_URL里的127.0.0.1换成服务器的IP;另外attu镜像的版本号(attu:v后面的部分)尽量接近你的Milvus版本。
之后输入对应链接就行啦。例如我的http://192.0.0.181:8000
参考链接:Milvus向量数据库基础用法及注意细节
欢迎大家点赞或收藏
大家的点赞或收藏可以鼓励作者加快更新~
原文地址:https://blog.csdn.net/weixin_44598554/article/details/143973443
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!