In [1]:
from google.colab import drive
drive.mount('/content/drive')
base_dir = "/content/drive/MyDrive/huggingface-rag"
In [2]:
!pip install qdrant-client sentence-transformers fastembed
In [3]:
import json
import os
import uuid
from tqdm import tqdm
from qdrant_client import QdrantClient, models
from sentence_transformers import SentenceTransformer
from fastembed import SparseTextEmbedding
In [4]:
from sentence_transformers import SentenceTransformer
output_path = f"{base_dir}/ft-jina-transformers-v1"
dense_model = SentenceTransformer(output_path, trust_remote_code=True)
dense_dim = dense_model.get_sentence_embedding_dimension()
print(f"Dense model embedding size: {dense_dim}")
sparse_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")
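Optional sanity check: encode one arbitrary sample sentence with both models to confirm they loaded and to see the dense dimensionality next to the number of active SPLADE terms (the sentence itself is illustrative only):
sample = "You can load a pretrained model with AutoModel.from_pretrained."
print(dense_model.encode(sample).shape)  # expected shape: (dense_dim,)
sparse_sample = list(sparse_model.embed([sample]))[0]
print(f"{len(sparse_sample.indices)} non-zero SPLADE dimensions")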
In [5]:
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
qdrant_path = f"{base_dir}/qdrant_hybrid_db"
# Remove a stale lock file left behind by a previous (crashed) session
lock_file = os.path.join(qdrant_path, ".lock")
if os.path.exists(lock_file):
    try:
        os.remove(lock_file)
        print(f"Removed stale lock file: {lock_file}")
    except Exception as e:
        print(f"Warning: Could not remove lock file: {e}")

client = QdrantClient(path=qdrant_path)
collection_name = 'huggingface_transformers_docs'

# Recreate the collection from scratch with one named dense vector and one named sparse vector
if client.collection_exists(collection_name):
    client.delete_collection(collection_name)
client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "text-dense": models.VectorParams(
            size=dense_dim,
            distance=models.Distance.COSINE,
        )
    },
    sparse_vectors_config={
        "text-sparse": models.SparseVectorParams(
            index=models.SparseIndexParams(
                on_disk=True,
            )
        )
    },
)
print(f"Collection '{collection_name}' created.")
In [6]:
from tqdm import tqdm
import uuid
import json
import hashlib
batch_size = 64
chunked_path = f"{base_dir}/chunks.jsonl"

# First pass: count documents so tqdm can show real progress
with open(chunked_path, 'r', encoding='utf-8') as f_in:
    total_docs = sum(1 for line in f_in if line.strip())

def index_batch(batch_docs):
    """Embed a batch of chunks with both models and upsert them into Qdrant."""
    batch_texts = [doc["text"] for doc in batch_docs]
    dense_vectors = dense_model.encode(batch_texts, convert_to_tensor=False).tolist()
    sparse_vectors = list(sparse_model.embed(batch_texts))
    points = []
    for doc, d_vec, s_vec in zip(batch_docs, dense_vectors, sparse_vectors):
        # Qdrant point IDs must be unsigned integers or UUIDs, so wrap the MD5 hash
        # of the chunk text in a UUID. The ID is deterministic, so re-running the
        # cell upserts the same points instead of duplicating them.
        doc_id = str(uuid.UUID(hex=hashlib.md5(doc["text"].encode("utf-8")).hexdigest()))
        payload = {
            "text": doc["text"],
            "source": doc.get("metadata", {}).get("source", "unknown"),
            "headers": doc.get("metadata", {}).get("headers", []),
            "full_metadata": doc.get("metadata", {}),
        }
        qdrant_sparse_vec = models.SparseVector(
            indices=s_vec.indices.tolist(),
            values=s_vec.values.tolist(),
        )
        points.append(models.PointStruct(
            id=doc_id,
            payload=payload,
            vector={
                "text-dense": d_vec,
                "text-sparse": qdrant_sparse_vec,
            },
        ))
    client.upsert(collection_name=collection_name, points=points)

# Second pass: stream the chunks, embedding and upserting one batch at a time
with open(chunked_path, 'r', encoding='utf-8') as f_in:
    batch_docs = []
    for line in tqdm(f_in, desc="Building document index", total=total_docs):
        line = line.strip()
        if not line:
            continue
        batch_docs.append(json.loads(line))
        if len(batch_docs) >= batch_size:
            index_batch(batch_docs)
            batch_docs = []

if batch_docs:
    print(f"Indexing final batch of {len(batch_docs)} documents...")
    index_batch(batch_docs)

print("Index building complete")
In [7]:
import time
import os
from qdrant_client import QdrantClient, models
def print_results(results, method_name):
    """Pretty-print the top results of a search method."""
    print(f"\n--- {method_name} Results ---")
    if not results:
        print("No results found.")
        return
    for i, point in enumerate(results):
        text_preview = point.payload['text'][:100].replace('\n', ' ')
        source = point.payload.get('source', 'Unknown Source')
        score = point.score
        print(f"{i+1}. [{score:.4f}] {source} | {text_preview}...")

# Remove a stale lock file before reconnecting to the on-disk Qdrant store
lock_file = os.path.join(qdrant_path, ".lock")
if os.path.exists(lock_file):
    try:
        os.remove(lock_file)
        print(f"Removed stale lock file: {lock_file}")
    except Exception as e:
        print(f"Warning: Could not remove lock file: {e}")

query_text = "How to use AutoModel?"

print(f"Connecting to Qdrant at {qdrant_path}...")
client = QdrantClient(path=qdrant_path)

print(f"Processing query: '{query_text}'")
query_dense_vec = dense_model.encode(query_text).tolist()
query_sparse_gen = list(sparse_model.embed([query_text]))[0]
query_sparse_vec = models.SparseVector(
    indices=query_sparse_gen.indices.tolist(),
    values=query_sparse_gen.values.tolist(),
)

# Dense-only search: query_points against the named dense vector via 'using'
results_dense = client.query_points(
    collection_name=collection_name,
    query=query_dense_vec,
    using="text-dense",
    limit=5,
    with_payload=True,
).points
print_results(results_dense, "ONLY DENSE (Semantic)")

# Sparse-only search: same API, but against the named sparse vector
results_sparse = client.query_points(
    collection_name=collection_name,
    query=query_sparse_vec,
    using="text-sparse",
    limit=5,
    with_payload=True,
).points
print_results(results_sparse, "ONLY SPARSE (Keyword/SPLADE)")

# Hybrid search: prefetch both candidate sets, then fuse them with Reciprocal Rank Fusion
prefetch_dense = models.Prefetch(
    query=query_dense_vec,
    using="text-dense",
    limit=20,  # expanded recall before fusion
)
prefetch_sparse = models.Prefetch(
    query=query_sparse_vec,
    using="text-sparse",
    limit=20,
)
results_hybrid = client.query_points(
    collection_name=collection_name,
    prefetch=[prefetch_dense, prefetch_sparse],
    query=models.FusionQuery(fusion=models.Fusion.RRF),
    limit=5,
    with_payload=True,
).points
print_results(results_hybrid, "HYBRID (RRF Fusion)")