import hashlib
import logging

from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
    """
    Chunk a list of page contents into smaller segments with document ID metadata.

    Args:
        page_list (list): List of strings, each string being the content of a page.
        doc_id (str): Unique identifier for the document.
        chunk_size (int): Maximum size of each chunk (default: 1000 characters).
        chunk_overlap (int): Overlap between chunks (default: 200 characters).

    Returns:
        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    documents = []
    seen_hashes = set()  # Track hashes of chunks to avoid duplicates

    for page_num, page_content in enumerate(page_list, start=1):  # Start page numbering at 1
        if not page_content or not isinstance(page_content, str):
            continue  # Skip empty or invalid pages

        # Split the page content into chunks
        chunks = text_splitter.split_text(page_content)

        for i, chunk in enumerate(chunks):
            # Generate a unique hash for the chunk
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

            # Skip if the chunk is a duplicate
            if chunk_hash in seen_hashes:
                continue

            # Create source identifier (e.g., "doc_123_page_1_chunk_0")
            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"

            # Add the chunk with doc_id as metadata
            documents.append({
                'text': chunk,
                'source': source,
                'doc_id': doc_id
            })
            seen_hashes.add(chunk_hash)

    logging.info(f"Chunking complete: produced {len(documents)} chunks from {len(page_list)} pages.")
    return documents
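

# Minimal usage sketch (not part of the original function): the page contents and
# doc_id below are hypothetical placeholders, and this assumes LangChain is installed.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Two fake "pages", each long enough to be split into multiple overlapping chunks
    sample_pages = [
        "Page one text. " * 100,
        "Page two text. " * 100,
    ]

    for doc_chunk in chunk_documents(sample_pages, doc_id="123"):
        print(doc_chunk["source"], len(doc_chunk["text"]))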