Spaces:

chaaim123
/

demo09

Sleeping

App Files Files Community

demo09 / retriever /chunk_documents.py

chaaim123

Create retriever/chunk_documents.py

aa2059f verified about 1 month ago

raw

history blame contribute delete

2.04 kB

	import logging
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import hashlib

	def chunk_documents(page_list, doc_id, chunk_size=1000, chunk_overlap=200):
	"""
	Chunk a list of page contents into smaller segments with document ID metadata.

	Args:
	page_list (list): List of strings, each string being the content of a page.
	doc_id (str): Unique identifier for the document.
	chunk_size (int): Maximum size of each chunk (default: 1000 characters).
	chunk_overlap (int): Overlap between chunks (default: 200 characters).

	Returns:
	list: List of dictionaries, each containing 'text', 'source', and 'doc_id'.
	"""
	text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	documents = []
	seen_hashes = set() # Track hashes of chunks to avoid duplicates

	for page_num, page_content in enumerate(page_list, start=1): # Start page numbering at 1
	if not page_content or not isinstance(page_content, str):
	continue # Skip empty or invalid pages

	# Split the page content into chunks
	chunks = text_splitter.split_text(page_content)

	for i, chunk in enumerate(chunks):
	# Generate a unique hash for the chunk
	chunk_hash = hashlib.sha256(chunk.encode()).hexdigest()

	# Skip if the chunk is a duplicate
	if chunk_hash in seen_hashes:
	continue

	# Create source identifier (e.g., "doc_123_page_1_chunk_0")
	source = f"doc_{doc_id}_page_{page_num}_chunk_{i}"

	# Add the chunk with doc_id as metadata
	documents.append({
	'text': chunk,
	'source': source,
	'doc_id': doc_id
	})
	seen_hashes.add(chunk_hash)

	logging.info(f"Chunking of documents is done. Chunked the document to {len(documents)} numbers of chunks")
	return documents