import hashlib
import logging

from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
    """
    Chunk a list of page contents into smaller segments with document ID metadata.

    Args:
        page_list (list): List of strings, each string being the content of a page.
        doc_id (str): Unique identifier for the document.
        chunk_size (int): Maximum size of each chunk, in characters (default: 1000).
        chunk_overlap (int): Number of characters of overlap between consecutive chunks (default: 200).

    Returns:
        list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = []
    seen_hashes = set()

    for page_num, page_content in enumerate(page_list, start=1):
        # Skip empty pages and anything that is not a string.
        if not page_content or not isinstance(page_content, str):
            continue

        chunks = text_splitter.split_text(page_content)

        for i, chunk in enumerate(chunks):
            # Deduplicate identical chunks across the whole document.
            chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()
            if chunk_hash in seen_hashes:
                continue

            source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"
            documents.append({
                'text': chunk,
                'source': source,
                'doc_id': doc_id
            })
            seen_hashes.add(chunk_hash)

logging.info(f"Chunking of documents is done. Chunked the document to {len(documents)} numbers of chunks") |
|
return documents |
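

# Usage sketch: a minimal, hypothetical example of calling chunk_documents.
# The sample pages and doc_id below are illustrative only and are not part of
# the original module; running this requires langchain to be installed.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Two fake "pages" long enough to be split into several chunks each.
    sample_pages = [
        " ".join(f"Sentence {i} on page one." for i in range(120)),
        " ".join(f"Sentence {i} on page two." for i in range(120)),
    ]

    chunks = chunk_documents(sample_pages, doc_id="example-001")
    for record in chunks[:3]:
        print(record["source"], len(record["text"]))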