# src/utils/drive_document_processor.py
from pathlib import Path
from typing import Dict, List, Any
from fastapi import HTTPException

from src.utils.google_drive_service import GoogleDriveService
from src.utils.document_processor import DocumentProcessor
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.utils.logger import logger
from src.db.mongodb_store import MongoDBStore


class DriveDocumentProcessor:
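    """
    Sync documents from a Google Drive folder into a Chroma vector store and
    MongoDB, downloading supported files to a temporary directory for
    processing.

    Example (illustrative sketch; constructing the collaborating
    DocumentProcessor, MongoDBStore, and ChromaVectorStore instances is not
    shown in this module):

        processor = DriveDocumentProcessor(
            google_service_account_path="credentials/service_account.json",
            folder_id="<drive-folder-id>",
            temp_dir="./tmp",
            doc_processor=doc_processor,
            mongodb=mongodb,
        )
        results = await processor.process_documents(vector_store)
    """
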
    def __init__(
        self,
        google_service_account_path: str,
        folder_id: str,
        temp_dir: str,
        doc_processor: DocumentProcessor,
        mongodb: MongoDBStore
    ):
        """
        Initialize Drive Document Processor

        Args:
            google_service_account_path (str): Path to Google service account credentials
            folder_id (str): Google Drive folder ID to process
            temp_dir (str): Directory for temporary files
            doc_processor (DocumentProcessor): Instance of DocumentProcessor
            mongodb (MongoDBStore): MongoDB store used to persist document metadata
        """
        self.google_drive_service = GoogleDriveService(
            google_service_account_path)
        self.folder_id = folder_id
        self.temp_dir = Path(temp_dir)
        self.doc_processor = doc_processor
        self.mongodb = mongodb

        # Create temp directory if it doesn't exist
        self.temp_dir.mkdir(parents=True, exist_ok=True)

        # Define supported MIME types
        self.supported_mime_types = {
            # Google Docs
            'application/vnd.google-apps.document': '.docx',

            # Microsoft Word Documents
            'application/vnd.openxmlformats-officedocument.wordprocessingml.document': '.docx',
            'application/msword': '.doc',

            # Microsoft Excel Documents
            'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': '.xlsx',
            'application/vnd.ms-excel': '.xls',

            # Text Documents
            'text/plain': '.txt',
            'text/csv': '.csv',
            'text/markdown': '.md',
            'text/html': '.html',
            'text/xml': '.xml',
            'application/json': '.json',
            'application/rtf': '.rtf',

            # PDF Documents
            'application/pdf': '.pdf'
        }

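        # Google-native formats can't be downloaded as raw bytes; map each to
        # the MIME type requested from the Drive export API.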
        self.google_docs_export_types = {
            'application/vnd.google-apps.document': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
        }

    async def _cleanup_orphaned_documents(
        self,
        drive_files: List[Dict[str, Any]],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """
        Clean up documents that exist in MongoDB but not in Google Drive

        Args:
            drive_files (List[Dict[str, Any]]): List of files from Google Drive
            vector_store (ChromaVectorStore): Vector store instance

        Returns:
            Dict[str, Any]: Cleanup statistics
        """
        try:
            # Get all documents from MongoDB
            mongo_docs = await self.mongodb.get_all_documents()

            # Create set of Google Drive file IDs
            drive_file_ids = {file['id'] for file in drive_files}

            deleted_count = 0
            failed_deletions = []

            # Check each MongoDB document
            for doc in mongo_docs:
                # Only process Google Drive documents
                if doc.get('source') != 'google_drive':
                    continue

                doc_id = doc.get('document_id')
                if not doc_id:
                    # Without an ID there is nothing to match or delete safely
                    continue

                if doc_id not in drive_file_ids:
                    try:
                        # Delete from MongoDB
                        await self.mongodb.delete_document(doc_id)

                        # Delete from vector store
                        vector_store.delete_document(doc_id)

                        deleted_count += 1

                    except Exception as e:
                        logger.error(
                            f"Error deleting orphaned document {doc_id}: {str(e)}")
                        failed_deletions.append({
                            'document_id': doc_id,
                            'error': str(e)
                        })

            return {
                'orphaned_documents_deleted': deleted_count,
                'failed_deletions': failed_deletions
            }

        except Exception as e:
            logger.error(f"Error in cleanup_orphaned_documents: {str(e)}")
            raise

    async def process_documents(
        self,
        vector_store: ChromaVectorStore,
        include_subfolders: bool = True
    ) -> Dict[str, Any]:
        """
        Process all documents in the specified Drive folder

        Args:
            vector_store (ChromaVectorStore): Vector store instance
            include_subfolders (bool): Whether to process documents in subfolders

        Returns:
            Dict[str, Any]: Processing results
        """
        try:
            # Get documents from folder
            files = self.google_drive_service.get_folder_contents(
                self.folder_id,
                include_subfolders=include_subfolders
            )

            # Clean up orphaned documents first
            cleanup_results = await self._cleanup_orphaned_documents(files, vector_store)

            processed_files = []
            skipped_files = []
            errors = []

            for file in files:
                # Skip if it's a folder
                if file.get('mimeType') == 'application/vnd.google-apps.folder':
                    continue

                # Get file path (including folder structure if available)
                file_path = self._get_file_path(file)
                file['display_path'] = file_path

                result = await self._process_single_file(file, vector_store)

                if result['status'] == 'processed':
                    processed_files.append(result['data'])
                elif result['status'] == 'skipped':
                    skipped_files.append(result['data'])
                else:  # status == 'error'
                    errors.append(result['data'])

            # Clean up temporary directory if empty
            self._cleanup_temp_dir()

            return {
                "status": "completed",
                "cleanup": cleanup_results,
                "processed_files": {
                    "count": len(processed_files),
                    "details": processed_files
                },
                "skipped_files": {
                    "count": len(skipped_files),
                    "details": skipped_files
                },
                "errors": {
                    "count": len(errors),
                    "details": errors
                }
            }

        except Exception as e:
            logger.error(f"Error processing Drive documents: {str(e)}")
            raise HTTPException(
                status_code=500,
                detail=f"Failed to process drive documents: {str(e)}"
            )

    def _get_file_path(self, file: Dict[str, Any]) -> str:
        """
        Get the full path for a file including its folder structure

        Args:
            file (Dict[str, Any]): File metadata

        Returns:
            str: Display path of the file
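
        Example (assumes folder_path lists ancestors top-down, which is the
        order this method preserves):
            {'name': 'report.docx',
             'folder_path': [{'name': 'Projects'}, {'name': '2024'}]}
            -> 'Projects/2024/report.docx'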
        """
        path_parts = [file['name']]

        # Prepend folder names when folder metadata is available
        if folder_path := file.get('folder_path', []):
            for folder in reversed(folder_path):
                path_parts.insert(0, folder['name'])

        return '/'.join(path_parts)

    async def _process_single_file(
        self,
        file: Dict[str, Any],
        vector_store: ChromaVectorStore
    ) -> Dict[str, Any]:
        """Process a single Drive file"""
        mime_type = file.get('mimeType', '')

        # Skip if mime type not supported
        if mime_type not in self.supported_mime_types:
            return {
                'status': 'skipped',
                'data': {
                    'name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'reason': f'Unsupported mime type: {mime_type}'
                }
            }

        try:
            document_id = file['id']
            modified_time = file.get('modifiedTime', 'N/A')

            # Check if document should be processed
            if self._should_process_document(
                    document_id, vector_store, modified_time):
                # Download and process file
                temp_file_path = await self._download_and_save_file(
                    file['id'],
                    mime_type
                )

                try:
                    # Process document
                    processed_doc = await self.doc_processor.process_document(
                        str(temp_file_path)
                    )

                    # Add to vector store with path information
                    self._add_to_vector_store(
                        processed_doc['chunks'],
                        file,
                        mime_type,
                        vector_store
                    )

                    # Record document metadata in MongoDB, including the Drive URL
                    await self.mongodb.store_document(
                        document_id=document_id,
                        filename=file['name'],
                        content_type=mime_type,
                        file_size=0,  # Not needed for drive documents
                        url_path=f"https://drive.google.com/file/d/{document_id}/view",
                        source="google_drive"
                    )

                    return {
                        'status': 'processed',
                        'data': {
                            'name': file['name'],
                            'path': file.get('display_path', file['name']),
                            'id': file['id'],
                            'chunks_processed': len(processed_doc['chunks'])
                        }
                    }

                finally:
                    # Clean up temporary file
                    if temp_file_path.exists():
                        temp_file_path.unlink()
            else:
                return {
                    'status': 'skipped',
                    'data': {
                        'name': file['name'],
                        'path': file.get('display_path', file['name']),
                        'reason': 'Document is already up to date in the vector store.'
                    }
                }

        except Exception as e:
            logger.error(f"Error processing file {file['name']}: {str(e)}")
            return {
                'status': 'error',
                'data': {
                    'file_name': file['name'],
                    'path': file.get('display_path', file['name']),
                    'error': str(e)
                }
            }

    def _add_to_vector_store(
        self,
        chunks: List[str],
        file: Dict[str, Any],
        mime_type: str,
        vector_store: ChromaVectorStore
    ) -> None:
        """Add processed chunks to vector store with path information"""
        chunk_metadatas = []
        chunk_ids = []

        modified_time = file.get('modifiedTime', 'N/A')
        file_path = file.get('display_path', file['name'])

        for i, chunk in enumerate(chunks):
            chunk_id = f"{file['id']}-chunk-{i}"
            chunk_ids.append(chunk_id)
            chunk_metadatas.append({
                "source": file_path,  # Use full path instead of just name
                "document_id": file['id'],
                "chunk_index": i,
                "mime_type": mime_type,
                "modified_time": modified_time,
                "total_chunks": len(chunks),
                "file_type": self.supported_mime_types[mime_type],
                "is_google_doc": mime_type.startswith('application/vnd.google-apps')
            })

        vector_store.add_documents(
            documents=chunks,
            metadatas=chunk_metadatas,
            ids=chunk_ids
        )

    async def _download_and_save_file(
        self,
        file_id: str,
        mime_type: str
    ) -> Path:
        """Download and save file to temporary location"""
        extension = self.supported_mime_types[mime_type]
        temp_file_path = self.temp_dir / f"{file_id}{extension}"

        if mime_type in self.google_docs_export_types:
            # Download Google Doc in the specified export format
            content = self.google_drive_service.export_file(
                file_id,
                self.google_docs_export_types[mime_type]
            )
        else:
            # Download regular file
            content = self.google_drive_service.download_file(file_id)

        with open(temp_file_path, 'wb') as f:
            if isinstance(content, str):
                f.write(content.encode('utf-8'))
            else:
                f.write(content)

        return temp_file_path

    def _should_process_document(
        self,
        document_id: str,
        vector_store: ChromaVectorStore,
        modified_date: str
    ) -> bool:
        """
        Check if document needs to be processed based on modification date

        Args:
            document_id (str): ID of the document to check
            vector_store (ChromaVectorStore): Vector store instance
            modified_date (str): Modified date to compare against

        Returns:
            bool: True if document should be processed, False otherwise
        """
        try:
            # Retrieve all chunks for the given document_id
            chunks = vector_store.get_document_chunks(document_id)

            if not chunks:
                # Document doesn't exist in vector store
                return True

            # Check the modified_time of the first chunk
            first_chunk_metadata = chunks[0].get("metadata", {})

            if first_chunk_metadata.get("modified_time") != modified_date:
                # If modified_time doesn't match, delete existing chunks
                vector_store.delete_document(document_id)
                logger.info(
                    f"Document {document_id} has been modified, will reprocess")
                return True

            logger.info(f"Document {document_id} is up to date, skipping")
            return False

        except Exception as e:
            logger.error(f"Error checking document status: {str(e)}")
            # In case of error, process the document to be safe
            return True

    def _cleanup_temp_dir(self) -> None:
        """Clean up temporary directory if empty"""
        try:
            if self.temp_dir.exists() and not any(self.temp_dir.iterdir()):
                self.temp_dir.rmdir()
        except Exception as e:
            logger.error(f"Error cleaning up temp directory: {str(e)}")
            # Don't raise the error as this is a cleanup operation