"""Build a Chroma vector store from local documents using Ollama embeddings.

Loads files from SOURCE_DIRECTORY, chunks them, embeds them with the
``nomic-embed-text`` model, and persists the index to PERSIST_DIRECTORY.
"""
import os
import shutil

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings

# ---- Configuration -------------------------------------------------------

# Directory scanned (recursively) for documents to index.
SOURCE_DIRECTORY = "/data/rag_source"

# Directory where the Chroma database is persisted on disk.
PERSIST_DIRECTORY = "/data/db"

# Characters per text chunk, and overlap between consecutive chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

def build_vector_store():
    """Build and persist a Chroma vector store from documents in SOURCE_DIRECTORY.

    Loads every file under SOURCE_DIRECTORY, splits the documents into
    overlapping character chunks, embeds them with a local Ollama model,
    and writes the resulting Chroma database to PERSIST_DIRECTORY,
    replacing any previously persisted database.

    Returns:
        None. Side effects only: may delete and recreate PERSIST_DIRECTORY.
    """
    print("Starting to build vector store...")

    # Load the documents BEFORE deleting the old database, so that an empty
    # or missing source directory does not destroy a still-usable index.
    print(f"Loading documents from {SOURCE_DIRECTORY}...")
    loader = DirectoryLoader(
        SOURCE_DIRECTORY,
        glob="**/*.*",  # NOTE(review): skips extensionless files (e.g. "README") — confirm intended
        show_progress=True,
        use_multithreading=True,
    )
    documents = loader.load()

    if not documents:
        print("No documents found. Exiting.")
        return

    print(f"Loaded {len(documents)} documents.")

    # Only now is it safe to clean up the previous database.
    if os.path.exists(PERSIST_DIRECTORY):
        print(f"Removing old database from {PERSIST_DIRECTORY}")
        shutil.rmtree(PERSIST_DIRECTORY)

    # Split the documents into overlapping chunks for retrieval.
    print("Splitting documents into chunks...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )
    texts = text_splitter.split_documents(documents)
    print(f"Split documents into {len(texts)} chunks.")

    # Create the embedding function (requires a running Ollama server).
    print("Creating embeddings...")
    embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

    # Create and persist the vector store.
    print("Creating and persisting vector store...")
    db = Chroma.from_documents(texts, embeddings, persist_directory=PERSIST_DIRECTORY)
    print("Vector store created successfully.")
    # NOTE(review): Chroma >= 0.4 persists automatically when a
    # persist_directory is given; kept for compatibility with older clients.
    db.persist()
    print("Vector store persisted.")

if __name__ == "__main__":
    # Run the ingestion pipeline only when executed as a script, so the
    # module can be imported without triggering a rebuild.
    build_vector_store()