# Fill a FAISS index and a MongoDB collection with randomly generated vectors.
import os
import time

import faiss
import numpy as np
import pymongo
from bson.objectid import ObjectId
# MongoDB connection settings.
# NOTE(review): the fallback URI embeds credentials in source control. Set the
# MONGODB_URI environment variable to supply the connection string (and its
# credentials) from the environment instead; the literal is kept only so that
# existing local setups keep working unchanged.
client = pymongo.MongoClient(
    os.environ.get(
        "MONGODB_URI",
        "mongodb://root:faiss_image_search@localhost:27017/",
    )
)
db = client["faiss_index"]
collection = db["mat_vectors"]
# Enforce uniqueness on both keys stored with every vector document.
# faiss_id is the same id that is passed to the FAISS index for the vector.
collection.create_index([("product_id", 1)], unique=True)
collection.create_index([("faiss_id", 1)], unique=True)

# FAISS settings: a flat L2 index wrapped in an ID map so that vectors can be
# added under explicitly chosen ids (see add_with_ids usage below).
dimension = 2048
base_index = faiss.IndexFlatL2(dimension)
index = faiss.IndexIDMap(base_index)
def generate_random_vector(dimension: int) -> np.ndarray:
    """Return a 1-D float32 array holding ``dimension`` random samples.

    The samples come from ``np.random.random`` (NumPy's global random state)
    and are cast to float32 before being returned.

    Args:
        dimension: Number of components in the generated vector.

    Returns:
        A NumPy array of shape ``(dimension,)`` and dtype float32.
    """
    return np.random.random(dimension).astype(np.float32)
def insert_million_records(total_records=200_000, batch_size=10_000):
    """Insert random vectors into the FAISS index and the MongoDB collection.

    Despite the name, the default run inserts 200,000 records (not one
    million), in batches of 10,000. Each record gets a sequential ``faiss_id``
    starting at 0; the same id is used for the vector in the FAISS index and
    stored in the MongoDB document next to the vector itself.

    Args:
        total_records: Total number of records to generate and insert.
        batch_size: Number of records written per FAISS add / MongoDB
            ``insert_many`` call.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    start_time = time.time()
    for batch_start in range(0, total_records, batch_size):
        # Clamp the final batch so exactly total_records rows are written even
        # when total_records is not a multiple of batch_size.
        batch_len = min(batch_size, total_records - batch_start)
        faiss_ids = list(range(batch_start, batch_start + batch_len))
        vectors = np.array(
            [generate_random_vector(dimension) for _ in range(batch_len)]
        )

        # One FAISS call per batch instead of one per vector. The ids are
        # given an explicit int64 dtype because the platform default integer
        # type is not int64 everywhere.
        index.add_with_ids(vectors, np.array(faiss_ids, dtype=np.int64))

        # faiss_ids holds plain Python ints and vectors are converted to
        # lists so the documents contain only BSON-encodable values.
        batch = [
            {
                "_id": ObjectId(),
                "product_id": ObjectId(),
                "faiss_id": faiss_id,
                "vector": vector.tolist(),
            }
            for faiss_id, vector in zip(faiss_ids, vectors)
        ]
        collection.insert_many(batch)
        print(f"Inserted {batch_start + batch_len} records")

    end_time = time.time()
    print(f"Total time taken: {end_time - start_time} seconds")
# Run the bulk insert only when the file is executed as a script.
if __name__ == "__main__":
    insert_million_records()