morphik-core/core/tools/document_tools.py

"""Document retrieval and management tools."""

import json
import logging
from datetime import UTC, datetime
from typing import Any, Dict, List, Literal, Optional

from core.models.auth import AuthContext
from core.services.document_service import DocumentService

logger = logging.getLogger(__name__)


class ToolError(Exception):
    """Exception raised when a tool execution fails."""

    pass


async def retrieve_chunks(
    query: str,
    k: int = 5,
    filters: Optional[Dict[str, Any]] = None,
    min_relevance: float = 0.7,
    use_colpali: bool = True,
    folder_name: Optional[str] = None,
    end_user_id: Optional[str] = None,
    document_service: DocumentService = None,
    auth: AuthContext = None,
) -> List[Dict[str, Any]]:
    """
    Retrieve the most relevant text and image chunks from the knowledge base.

    Args:
        query: The search query or question
        k: Number of chunks to retrieve (default: 5)
        filters: Metadata filters to narrow results
        min_relevance: Minimum relevance score threshold (0-1)
        use_colpali: Whether to use multimodal features
        folder_name: Optional folder to scope the search to
        end_user_id: Optional end-user ID to scope the search to
        document_service: DocumentService instance
        auth: Authentication context

    Returns:
        List of content items with text and images
    """
    if document_service is None:
        raise ToolError("Document service not provided")

    try:
        # Directly await the document service method
        chunks = await document_service.retrieve_chunks(
            query=query,
            auth=auth,
            filters=filters,
            k=k,
            min_score=min_relevance,
            use_colpali=use_colpali,
            folder_name=folder_name,
            end_user_id=end_user_id,
        )

        # Format the results for LiteLLM tool response
        content = []

        # Add a header text element
        content.append({"type": "text", "text": f"Found {len(chunks)} relevant chunks:"})

        for chunk in chunks:
            # Check if this is an image chunk
            if chunk.metadata.get("is_image", False):
                # Add image to content
                if chunk.content.startswith("data:"):
                    # Already in data URL format
                    content.append({"type": "image_url", "image_url": {"url": chunk.content}})
                else:
                    # Assuming it's base64, convert to data URL format
                    # TODO: potential bug here, if the base64 image is not a png
                    content.append(
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{chunk.content}"}}
                    )
            else:
                # Add text content with metadata
                text = f"Document: {chunk.filename or 'Unnamed'} (Score: {chunk.score:.2f})\n\n{chunk.content}"
                content.append(
                    {
                        "type": "text",
                        "text": text,
                    }
                )
        return content
    except Exception as e:
        raise ToolError(f"Error retrieving chunks: {str(e)}")


async def retrieve_document(
    document_id: str,
    format: Optional[Literal["text", "metadata"]] = "text",
    document_service: DocumentService = None,
    auth: AuthContext = None,
    end_user_id: Optional[str] = None,
) -> str:
    """
    Retrieve full content of a specific document.

    Args:
        document_id: ID of the document to retrieve
        format: Desired format of the returned document
        document_service: DocumentService instance
        auth: Authentication context
        end_user_id: Optional end-user ID to retrieve as

    Returns:
        Document content or metadata as a string
    """
    if document_service is None:
        raise ToolError("Document service not provided")

    try:
        # Directly await the document service method
        doc = await document_service.batch_retrieve_documents(
            document_ids=[document_id], auth=auth, end_user_id=end_user_id
        )

        if not doc or len(doc) == 0:
            return f"Document {document_id} not found or not accessible"

        doc = doc[0]  # Get the first document from the list

        if format == "text":
            return doc.system_metadata.get("content", "No content available")
        else:
            # Return both user-defined metadata and system metadata separately
            result: Dict[str, Any] = {}
            # User metadata
            if hasattr(doc, "metadata") and doc.metadata:
                result["metadata"] = doc.metadata
            # System metadata without content field
            if hasattr(doc, "system_metadata") and doc.system_metadata:
                system_metadata = doc.system_metadata.copy()
                if "content" in system_metadata:
                    del system_metadata["content"]
                result["system_metadata"] = system_metadata
            return json.dumps(result, indent=2, default=str)

    except Exception as e:
        raise ToolError(f"Error retrieving document: {str(e)}")


async def save_to_memory(
    content: str,
    memory_type: Literal["session", "long_term", "research_thread"],
    tags: Optional[List[str]] = None,
    document_service: DocumentService = None,
    auth: AuthContext = None,
    end_user_id: Optional[str] = None,
) -> str:
    """
    Save important information to persistent memory.

    Args:
        content: Content to save
        memory_type: Type of memory to save to
        tags: Tags for categorizing the memory
        document_service: DocumentService instance
        auth: Authentication context
        end_user_id: Optional end-user ID to save as

    Returns:
        Save operation result as a string
    """
    if document_service is None:
        raise ToolError("Document service not provided")

    try:
        # Create metadata for the saved memory
        metadata = {"memory_type": memory_type, "source": "agent"}

        if tags:
            metadata["tags"] = tags

        # Use document service to ingest the content as a document
        timestamp = await get_timestamp()
        result = await document_service.ingest_text(
            content=content,
            filename=f"memory_{memory_type}_{timestamp}",
            metadata=metadata,
            auth=auth,
            end_user_id=end_user_id,
        )

        return json.dumps({"success": True, "memory_id": result.external_id, "memory_type": memory_type})
    except Exception as e:
        raise ToolError(f"Error saving to memory: {str(e)}")


async def list_documents(
    filters: Optional[Dict[str, Any]] = None,
    skip: int = 0,
    limit: int = 100,
    folder_name: Optional[str] = None,
    end_user_id: Optional[str] = None,
    document_service: DocumentService = None,
    auth: AuthContext = None,
) -> str:
    """
    List accessible documents, showing their IDs and filenames.

    Args:
        filters: Optional metadata filters
        skip: Number of documents to skip (default: 0)
        limit: Maximum number of documents to return (default: 100)
        folder_name: Optional folder to scope the listing to
        end_user_id: Optional end-user ID to scope the listing to
        document_service: DocumentService instance
        auth: Authentication context

    Returns:
        JSON string list of documents with id and filename
    """
    if document_service is None:
        raise ToolError("Document service not provided")

    try:
        # Create system filters for folder and user scoping
        system_filters = {}
        if folder_name:
            system_filters["folder_name"] = folder_name
        if end_user_id:
            system_filters["end_user_id"] = end_user_id

        # Retrieve documents from the database
        docs = await document_service.db.get_documents(
            auth=auth, skip=skip, limit=limit, filters=filters, system_filters=system_filters
        )

        # Format the results to only include ID and filename
        formatted_docs = [{"id": doc.external_id, "filename": doc.filename} for doc in docs]

        return json.dumps({"count": len(formatted_docs), "documents": formatted_docs}, indent=2)

    except PermissionError as e:
        # Re-raise PermissionError as ToolError for consistent handling
        raise ToolError(str(e))
    except Exception as e:
        raise ToolError(f"Error listing documents: {str(e)}")


async def get_timestamp() -> str:
    """Get current timestamp in ISO format."""
    return datetime.now(UTC).isoformat().replace(":", "-").replace(".", "-")
Deep research (#126) 2025-05-01 17:02:22 -07:00			`"""Document retrieval and management tools."""`

			`import json`
			`import logging`
			`from datetime import UTC, datetime`
			`from typing import Any, Dict, List, Literal, Optional`

			`from core.models.auth import AuthContext`
			`from core.services.document_service import DocumentService`

			`logger = logging.getLogger(__name__)`


			`class ToolError(Exception):`
			`"""Exception raised when a tool execution fails."""`

			`pass`


			`async def retrieve_chunks(`
			`query: str,`
			`k: int = 5,`
			`filters: Optional[Dict[str, Any]] = None,`
			`min_relevance: float = 0.7,`
			`use_colpali: bool = True,`
			`folder_name: Optional[str] = None,`
			`end_user_id: Optional[str] = None,`
			`document_service: DocumentService = None,`
			`auth: AuthContext = None,`
			`) -> List[Dict[str, Any]]:`
			`"""`
			`Retrieve the most relevant text and image chunks from the knowledge base.`

			`Args:`
			`query: The search query or question`
			`k: Number of chunks to retrieve (default: 5)`
			`filters: Metadata filters to narrow results`
			`min_relevance: Minimum relevance score threshold (0-1)`
			`use_colpali: Whether to use multimodal features`
			`folder_name: Optional folder to scope the search to`
			`end_user_id: Optional end-user ID to scope the search to`
			`document_service: DocumentService instance`
			`auth: Authentication context`

			`Returns:`
			`List of content items with text and images`
			`"""`
			`if document_service is None:`
			`raise ToolError("Document service not provided")`

			`try:`
			`# Directly await the document service method`
			`chunks = await document_service.retrieve_chunks(`
			`query=query,`
			`auth=auth,`
			`filters=filters,`
			`k=k,`
			`min_score=min_relevance,`
			`use_colpali=use_colpali,`
			`folder_name=folder_name,`
			`end_user_id=end_user_id,`
			`)`

			`# Format the results for LiteLLM tool response`
			`content = []`

			`# Add a header text element`
			`content.append({"type": "text", "text": f"Found {len(chunks)} relevant chunks:"})`

			`for chunk in chunks:`
			`# Check if this is an image chunk`
			`if chunk.metadata.get("is_image", False):`
			`# Add image to content`
			`if chunk.content.startswith("data:"):`
			`# Already in data URL format`
			`content.append({"type": "image_url", "image_url": {"url": chunk.content}})`
			`else:`
			`# Assuming it's base64, convert to data URL format`
			`# TODO: potential bug here, if the base64 image is not a png`
			`content.append(`
			`{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{chunk.content}"}}`
			`)`
			`else:`
			`# Add text content with metadata`
			`text = f"Document: {chunk.filename or 'Unnamed'} (Score: {chunk.score:.2f})\n\n{chunk.content}"`
			`content.append(`
			`{`
			`"type": "text",`
			`"text": text,`
			`}`
			`)`
			`return content`
			`except Exception as e:`
			`raise ToolError(f"Error retrieving chunks: {str(e)}")`


			`async def retrieve_document(`
			`document_id: str,`
			`format: Optional[Literal["text", "metadata"]] = "text",`
			`document_service: DocumentService = None,`
			`auth: AuthContext = None,`
			`end_user_id: Optional[str] = None,`
			`) -> str:`
			`"""`
			`Retrieve full content of a specific document.`

			`Args:`
			`document_id: ID of the document to retrieve`
			`format: Desired format of the returned document`
			`document_service: DocumentService instance`
			`auth: Authentication context`
			`end_user_id: Optional end-user ID to retrieve as`

			`Returns:`
			`Document content or metadata as a string`
			`"""`
			`if document_service is None:`
			`raise ToolError("Document service not provided")`

			`try:`
			`# Directly await the document service method`
			`doc = await document_service.batch_retrieve_documents(`
			`document_ids=[document_id], auth=auth, end_user_id=end_user_id`
			`)`

			`if not doc or len(doc) == 0:`
			`return f"Document {document_id} not found or not accessible"`

			`doc = doc[0] # Get the first document from the list`

			`if format == "text":`
			`return doc.system_metadata.get("content", "No content available")`
			`else:`
			`# Return both user-defined metadata and system metadata separately`
			`result: Dict[str, Any] = {}`
			`# User metadata`
			`if hasattr(doc, "metadata") and doc.metadata:`
			`result["metadata"] = doc.metadata`
			`# System metadata without content field`
			`if hasattr(doc, "system_metadata") and doc.system_metadata:`
			`system_metadata = doc.system_metadata.copy()`
			`if "content" in system_metadata:`
			`del system_metadata["content"]`
			`result["system_metadata"] = system_metadata`
			`return json.dumps(result, indent=2, default=str)`

			`except Exception as e:`
			`raise ToolError(f"Error retrieving document: {str(e)}")`


			`async def save_to_memory(`
			`content: str,`
			`memory_type: Literal["session", "long_term", "research_thread"],`
			`tags: Optional[List[str]] = None,`
			`document_service: DocumentService = None,`
			`auth: AuthContext = None,`
			`end_user_id: Optional[str] = None,`
			`) -> str:`
			`"""`
			`Save important information to persistent memory.`

			`Args:`
			`content: Content to save`
			`memory_type: Type of memory to save to`
			`tags: Tags for categorizing the memory`
			`document_service: DocumentService instance`
			`auth: Authentication context`
			`end_user_id: Optional end-user ID to save as`

			`Returns:`
			`Save operation result as a string`
			`"""`
			`if document_service is None:`
			`raise ToolError("Document service not provided")`

			`try:`
			`# Create metadata for the saved memory`
			`metadata = {"memory_type": memory_type, "source": "agent"}`

			`if tags:`
			`metadata["tags"] = tags`

			`# Use document service to ingest the content as a document`
			`timestamp = await get_timestamp()`
			`result = await document_service.ingest_text(`
			`content=content,`
			`filename=f"memory_{memory_type}_{timestamp}",`
			`metadata=metadata,`
			`auth=auth,`
			`end_user_id=end_user_id,`
			`)`

			`return json.dumps({"success": True, "memory_id": result.external_id, "memory_type": memory_type})`
			`except Exception as e:`
			`raise ToolError(f"Error saving to memory: {str(e)}")`


			`async def list_documents(`
			`filters: Optional[Dict[str, Any]] = None,`
			`skip: int = 0,`
			`limit: int = 100,`
			`folder_name: Optional[str] = None,`
			`end_user_id: Optional[str] = None,`
			`document_service: DocumentService = None,`
			`auth: AuthContext = None,`
			`) -> str:`
			`"""`
			`List accessible documents, showing their IDs and filenames.`

			`Args:`
			`filters: Optional metadata filters`
			`skip: Number of documents to skip (default: 0)`
			`limit: Maximum number of documents to return (default: 100)`
			`folder_name: Optional folder to scope the listing to`
			`end_user_id: Optional end-user ID to scope the listing to`
			`document_service: DocumentService instance`
			`auth: Authentication context`

			`Returns:`
			`JSON string list of documents with id and filename`
			`"""`
			`if document_service is None:`
			`raise ToolError("Document service not provided")`

			`try:`
			`# Create system filters for folder and user scoping`
			`system_filters = {}`
			`if folder_name:`
			`system_filters["folder_name"] = folder_name`
			`if end_user_id:`
			`system_filters["end_user_id"] = end_user_id`

			`# Retrieve documents from the database`
			`docs = await document_service.db.get_documents(`
			`auth=auth, skip=skip, limit=limit, filters=filters, system_filters=system_filters`
			`)`

			`# Format the results to only include ID and filename`
			`formatted_docs = [{"id": doc.external_id, "filename": doc.filename} for doc in docs]`

			`return json.dumps({"count": len(formatted_docs), "documents": formatted_docs}, indent=2)`

			`except PermissionError as e:`
			`# Re-raise PermissionError as ToolError for consistent handling`
			`raise ToolError(str(e))`
			`except Exception as e:`
			`raise ToolError(f"Error listing documents: {str(e)}")`


			`async def get_timestamp() -> str:`
			`"""Get current timestamp in ISO format."""`
			`return datetime.now(UTC).isoformat().replace(":", "-").replace(".", "-")`