diff --git a/core/api.py b/core/api.py
index 348af92..20b961c 100644
--- a/core/api.py
+++ b/core/api.py
@@ -13,7 +13,7 @@ import logging
 import arq
 from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
 from core.limits_utils import check_and_increment_limits
-from core.models.request import GenerateUriRequest, RetrieveRequest, CompletionQueryRequest, IngestTextRequest, CreateGraphRequest, UpdateGraphRequest, BatchIngestResponse
+from core.models.request import GenerateUriRequest, RetrieveRequest, CompletionQueryRequest, IngestTextRequest, CreateGraphRequest, UpdateGraphRequest, BatchIngestResponse, SetFolderRuleRequest
 from core.models.completion import ChunkSource, CompletionResponse
 from core.models.documents import Document, DocumentResult, ChunkResult
 from core.models.graph import Graph
@@ -1998,3 +1998,226 @@ async def generate_cloud_uri(
     except Exception as e:
         logger.error(f"Error generating cloud URI: {e}")
         raise HTTPException(status_code=500, detail=str(e))
+
+
+@app.post("/folders/{folder_id}/set_rule")
+async def set_folder_rule(
+    folder_id: str,
+    request: SetFolderRuleRequest,
+    auth: AuthContext = Depends(verify_token),
+    apply_to_existing: bool = True,
+):
+    """
+    Set extraction rules for a folder.
+
+    Args:
+        folder_id: ID of the folder to set rules for
+        request: SetFolderRuleRequest containing metadata extraction rules
+        auth: Authentication context
+        apply_to_existing: Whether to apply the rules to documents already in the folder
+
+    Returns:
+        Success status with processing results
+    """
+    # Import text here so it is available in this function's scope
+    from sqlalchemy import text
+    try:
+        async with telemetry.track_operation(
+            operation_type="set_folder_rule",
+            user_id=auth.entity_id,
+            metadata={
+                "folder_id": folder_id,
+                "rule_count": len(request.rules),
+                "apply_to_existing": apply_to_existing,
+            },
+        ):
+            # Log detailed information about the rules
+            logger.debug(f"Setting rules for folder {folder_id}")
+            logger.debug(f"Number of rules: {len(request.rules)}")
+
+            for i, rule in enumerate(request.rules):
+                logger.debug(f"\nRule {i + 1}:")
+                logger.debug(f"Type: {rule.type}")
+                logger.debug("Schema:")
+                for field_name, field_config in rule.schema.items():
+                    logger.debug(f"  Field: {field_name}")
+                    logger.debug(f"  Type: {field_config.get('type', 'unknown')}")
+                    logger.debug(f"  Description: {field_config.get('description', 'No description')}")
+                    if 'schema' in field_config:
+                        logger.debug("  Has JSON schema: Yes")
+                        logger.debug(f"  Schema: {field_config['schema']}")
+
+            # Get the folder
+            folder = await document_service.db.get_folder(folder_id, auth)
+            if not folder:
+                raise HTTPException(status_code=404, detail=f"Folder {folder_id} not found")
+
+            # Check that the user has write access to the folder
+            if not document_service.db._check_folder_access(folder, auth, "write"):
+                raise HTTPException(status_code=403, detail="You don't have write access to this folder")
+
+            # Update folder with rules
+            # Convert rules to dicts for JSON serialization
+            rules_dicts = [rule.model_dump() for rule in request.rules]
+
+            # Update the folder in the database
+            async with document_service.db.async_session() as session:
+                # Execute update query
+                await session.execute(
+                    text(
+                        """
+                        UPDATE folders
+                        SET rules = :rules
+                        WHERE id = :folder_id
+                        """
+                    ),
+                    {"folder_id": folder_id, "rules": json.dumps(rules_dicts)},
+                )
+                await session.commit()
+
+            logger.info(f"Successfully updated folder {folder_id} with {len(request.rules)} rules")
+
+            # Get updated folder
+            updated_folder = await document_service.db.get_folder(folder_id, auth)
+
+            # If apply_to_existing is True, apply these rules to all documents already in the folder
+            processing_results = {"processed": 0, "errors": []}
+
+            if apply_to_existing and folder.document_ids:
+                logger.info(f"Applying rules to {len(folder.document_ids)} existing documents in folder")
+
+                # Get all documents in the folder
+                documents = await document_service.db.get_documents_by_id(folder.document_ids, auth)
+
+                # Process each document
+                for doc in documents:
+                    try:
+                        # Get document content
+                        logger.info(f"Processing document {doc.external_id}")
+
+                        # For each document, apply the rules from the folder
+                        doc_content = None
+
+                        # Get content from system_metadata if available
+                        if doc.system_metadata and "content" in doc.system_metadata:
+                            doc_content = doc.system_metadata["content"]
+                            logger.info(f"Retrieved content from system_metadata for document {doc.external_id}")
+
+                        # If we still have no content, log the error and move on
+                        if not doc_content:
+                            error_msg = f"No content found in system_metadata for document {doc.external_id}"
+                            logger.error(error_msg)
+                            processing_results["errors"].append({
+                                "document_id": doc.external_id,
+                                "error": error_msg
+                            })
+                            continue
+
+                        # Process document with rules
+                        try:
+                            # Convert request rules to actual rule models and apply them
+                            from core.models.rules import MetadataExtractionRule
+
+                            for rule_request in request.rules:
+                                if rule_request.type == "metadata_extraction":
+                                    # Create the actual rule model
+                                    rule = MetadataExtractionRule(
+                                        type=rule_request.type,
+                                        schema=rule_request.schema
+                                    )
+
+                                    # Apply the rule with retries
+                                    max_retries = 3
+                                    base_delay = 1  # seconds
+                                    extracted_metadata = None
+                                    last_error = None
+
+                                    for retry_count in range(max_retries):
+                                        try:
+                                            if retry_count > 0:
+                                                # Exponential backoff
+                                                delay = base_delay * (2 ** (retry_count - 1))
+                                                logger.info(f"Retry {retry_count}/{max_retries} after {delay}s delay")
+                                                await asyncio.sleep(delay)
+
+                                            extracted_metadata, _ = await rule.apply(doc_content)
+                                            logger.info(f"Successfully extracted metadata on attempt {retry_count + 1}: {extracted_metadata}")
+                                            break  # Success, exit retry loop
+
+                                        except Exception as rule_apply_error:
+                                            last_error = rule_apply_error
+                                            logger.warning(f"Metadata extraction attempt {retry_count + 1} failed: {rule_apply_error}")
+                                            if retry_count == max_retries - 1:  # Last attempt
+                                                logger.error(f"All {max_retries} metadata extraction attempts failed")
+                                                processing_results["errors"].append({
+                                                    "document_id": doc.external_id,
+                                                    "error": f"Failed to extract metadata after {max_retries} attempts: {str(last_error)}"
+                                                })
+                                                continue  # Retries exhausted; extracted_metadata stays None, so the update below is skipped
+
+                                    # Update document metadata if extraction succeeded
+                                    if extracted_metadata:
+                                        # Merge new metadata with existing
+                                        doc.metadata.update(extracted_metadata)
+
+                                        # Create an updates dict that only updates metadata
+                                        # We need to create system_metadata with all preserved fields
+                                        # Note: in the database, metadata is stored as 'doc_metadata', not 'metadata'
+                                        updates = {
+                                            "doc_metadata": doc.metadata,  # Use doc_metadata for the database
+                                            
"system_metadata": {} # Will be merged with existing in update_document + } + + # Explicitly preserve the content field in system_metadata + if "content" in doc.system_metadata: + updates["system_metadata"]["content"] = doc.system_metadata["content"] + + # Log the updates we're making + logger.info(f"Updating document {doc.external_id} with metadata: {extracted_metadata}") + logger.info(f"Full metadata being updated: {doc.metadata}") + logger.info(f"Update object being sent to database: {updates}") + logger.info(f"Preserving content in system_metadata: {'content' in doc.system_metadata}") + + # Update document in database + success = await document_service.db.update_document( + doc.external_id, + updates, + auth + ) + + if success: + logger.info(f"Updated metadata for document {doc.external_id}") + processing_results["processed"] += 1 + else: + logger.error(f"Failed to update metadata for document {doc.external_id}") + processing_results["errors"].append({ + "document_id": doc.external_id, + "error": "Failed to update document metadata" + }) + except Exception as rule_error: + logger.error(f"Error processing rules for document {doc.external_id}: {rule_error}") + processing_results["errors"].append({ + "document_id": doc.external_id, + "error": f"Error processing rules: {str(rule_error)}" + }) + + except Exception as doc_error: + logger.error(f"Error processing document {doc.external_id}: {doc_error}") + processing_results["errors"].append({ + "document_id": doc.external_id, + "error": str(doc_error) + }) + + return { + "status": "success", + "message": "Rules set successfully", + "folder_id": folder_id, + "rules": updated_folder.rules, + "processing_results": processing_results + } + except HTTPException as e: + # Re-raise HTTP exceptions + raise + except Exception as e: + logger.error(f"Error setting folder rules: {e}") + raise HTTPException(status_code=500, detail=str(e)) diff --git a/core/database/postgres_database.py b/core/database/postgres_database.py index c1ce16e..903ca29 100644 --- a/core/database/postgres_database.py +++ b/core/database/postgres_database.py @@ -81,6 +81,7 @@ class FolderModel(Base): document_ids = Column(JSONB, default=list) system_metadata = Column(JSONB, default=dict) access_control = Column(JSONB, default=dict) + rules = Column(JSONB, default=list) # Create indexes __table_args__ = ( @@ -220,6 +221,28 @@ class PostgresDatabase(BaseDatabase): ) ) + # Add rules column to folders table if it doesn't exist + result = await conn.execute( + text( + """ + SELECT column_name + FROM information_schema.columns + WHERE table_name = 'folders' AND column_name = 'rules' + """ + ) + ) + if not result.first(): + # Add rules column to folders table + await conn.execute( + text( + """ + ALTER TABLE folders + ADD COLUMN IF NOT EXISTS rules JSONB DEFAULT '[]'::jsonb + """ + ) + ) + logger.info("Added rules column to folders table") + # Create indexes for folders table await conn.execute(text("CREATE INDEX IF NOT EXISTS idx_folder_name ON folders (name);")) await conn.execute(text("CREATE INDEX IF NOT EXISTS idx_folder_owner ON folders USING gin (owner);")) @@ -557,14 +580,17 @@ class PostgresDatabase(BaseDatabase): # Update system metadata updates.setdefault("system_metadata", {}) - # Preserve folder_name and end_user_id if not explicitly overridden + # Merge with existing system_metadata instead of just preserving specific fields if existing_doc.system_metadata: - if "folder_name" in existing_doc.system_metadata and "folder_name" not in updates["system_metadata"]: - 
updates["system_metadata"]["folder_name"] = existing_doc.system_metadata["folder_name"] - - if "end_user_id" in existing_doc.system_metadata and "end_user_id" not in updates["system_metadata"]: - updates["system_metadata"]["end_user_id"] = existing_doc.system_metadata["end_user_id"] + # Start with existing system_metadata + merged_system_metadata = dict(existing_doc.system_metadata) + # Update with new values + merged_system_metadata.update(updates["system_metadata"]) + # Replace with merged result + updates["system_metadata"] = merged_system_metadata + logger.debug(f"Merged system_metadata during document update, preserving existing fields") + # Always update the updated_at timestamp updates["system_metadata"]["updated_at"] = datetime.now(UTC) # Serialize datetime objects to ISO format strings @@ -577,9 +603,21 @@ class PostgresDatabase(BaseDatabase): doc_model = result.scalar_one_or_none() if doc_model: + # Log what we're updating + logger.info(f"Document update: updating fields {list(updates.keys())}") + + # Special handling for metadata/doc_metadata conversion + if "metadata" in updates and "doc_metadata" not in updates: + logger.info("Converting 'metadata' to 'doc_metadata' for database update") + updates["doc_metadata"] = updates.pop("metadata") + + # Set all attributes for key, value in updates.items(): + logger.debug(f"Setting document attribute {key} = {value}") setattr(doc_model, key, value) + await session.commit() + logger.info(f"Document {document_id} updated successfully") return True return False @@ -1108,7 +1146,8 @@ class PostgresDatabase(BaseDatabase): owner=folder_dict["owner"], document_ids=folder_dict.get("document_ids", []), system_metadata=folder_dict.get("system_metadata", {}), - access_control=access_control + access_control=access_control, + rules=folder_dict.get("rules", []) ) session.add(folder_model) @@ -1144,7 +1183,8 @@ class PostgresDatabase(BaseDatabase): "owner": folder_model.owner, "document_ids": folder_model.document_ids, "system_metadata": folder_model.system_metadata, - "access_control": folder_model.access_control + "access_control": folder_model.access_control, + "rules": folder_model.rules } folder = Folder(**folder_dict) @@ -1190,7 +1230,8 @@ class PostgresDatabase(BaseDatabase): "owner": folder_row.owner, "document_ids": folder_row.document_ids, "system_metadata": folder_row.system_metadata, - "access_control": folder_row.access_control + "access_control": folder_row.access_control, + "rules": folder_row.rules } return Folder(**folder_dict) @@ -1210,7 +1251,8 @@ class PostgresDatabase(BaseDatabase): "owner": folder_model.owner, "document_ids": folder_model.document_ids, "system_metadata": folder_model.system_metadata, - "access_control": folder_model.access_control + "access_control": folder_model.access_control, + "rules": folder_model.rules } folder = Folder(**folder_dict) @@ -1244,7 +1286,8 @@ class PostgresDatabase(BaseDatabase): "owner": folder_model.owner, "document_ids": folder_model.document_ids, "system_metadata": folder_model.system_metadata, - "access_control": folder_model.access_control + "access_control": folder_model.access_control, + "rules": folder_model.rules } folder = Folder(**folder_dict) diff --git a/core/models/folders.py b/core/models/folders.py index e17dd25..654c50c 100644 --- a/core/models/folders.py +++ b/core/models/folders.py @@ -21,6 +21,7 @@ class Folder(BaseModel): access_control: Dict[str, List[str]] = Field( default_factory=lambda: {"readers": [], "writers": [], "admins": []} ) + rules: List[Dict[str, Any]] = 
Field(default_factory=list)
 
     def __hash__(self):
         return hash(self.id)
diff --git a/core/models/request.py b/core/models/request.py
index 84ad4e3..3d5ec7f 100644
--- a/core/models/request.py
+++ b/core/models/request.py
@@ -109,3 +109,14 @@ class GenerateUriRequest(BaseModel):
     name: str = Field(..., description="Name of the application")
     user_id: str = Field(..., description="ID of the user who owns the app")
     expiry_days: int = Field(default=30, description="Number of days until the token expires")
+
+
+# Request models for folder rules
+class MetadataExtractionRuleRequest(BaseModel):
+    """Request model for a metadata extraction rule"""
+    type: str = "metadata_extraction"  # Only metadata_extraction is supported for now
+    schema: Dict[str, Any]
+
+class SetFolderRuleRequest(BaseModel):
+    """Request model for setting folder rules"""
+    rules: List[MetadataExtractionRuleRequest]
\ No newline at end of file
diff --git a/core/models/rules.py b/core/models/rules.py
index c10ba97..71ec381 100644
--- a/core/models/rules.py
+++ b/core/models/rules.py
@@ -74,14 +74,28 @@ class MetadataExtractionRule(BaseRule):
         # Create the dynamic model
         DynamicMetadataModel = create_model("DynamicMetadataModel", **field_definitions)
 
+        # Build an explicit instruction that clearly shows the expected output format
+        schema_descriptions = []
+        for field_name, field_config in self.schema.items():
+            field_type = field_config.get("type", "string") if isinstance(field_config, dict) else "string"
+            description = field_config.get("description", "No description") if isinstance(field_config, dict) else field_config
+            schema_descriptions.append(f"- {field_name}: {description} (type: {field_type})")
+
+        schema_text = "\n".join(schema_descriptions)
+
         prompt = f"""
         Extract metadata from the following text according to this schema:
-        {self.schema}
-
+
+        {schema_text}
+
         Text to extract from:
         {content}
-
-        Extract all relevant information that matches the schema.
+
+        Follow these guidelines:
+        1. Extract all requested information as simple strings, numbers, or booleans (not as objects or nested structures)
+        2. If a piece of information is not present, return null for it instead of making something up
+        3. Answer directly with the requested information - don't include explanations or reasoning
+        4. Be concise but accurate in your extractions
         """
 
         # Get the model configuration from registered_models
@@ -93,7 +107,7 @@ class MetadataExtractionRule(BaseRule):
 
         system_message = {
             "role": "system",
-            "content": "You are a metadata extraction assistant. Extract structured metadata from text precisely following the provided schema.",
+            "content": "You are a metadata extraction assistant. Extract structured metadata from text precisely following the provided schema. Always return the metadata as direct values (strings, numbers, booleans), not as objects with additional properties.",
         }
 
         user_message = {"role": "user", "content": prompt}
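For illustration, here is a minimal, self-contained sketch of what the new schema block in the prompt renders to. The loop mirrors the `schema_descriptions` logic added above; the two-field schema itself is a hypothetical example:

```python
# Mirrors the schema_descriptions loop added to MetadataExtractionRule.
# The sample schema below is hypothetical.
schema = {
    "title": {"type": "string", "description": "Document title"},
    "year": {"type": "integer", "description": "Publication year"},
}

schema_descriptions = []
for field_name, field_config in schema.items():
    field_type = field_config.get("type", "string") if isinstance(field_config, dict) else "string"
    description = field_config.get("description", "No description") if isinstance(field_config, dict) else field_config
    schema_descriptions.append(f"- {field_name}: {description} (type: {field_type})")

print("\n".join(schema_descriptions))
# Output:
# - title: Document title (type: string)
# - year: Publication year (type: integer)
```

Spelling each field out as a one-line bullet, rather than dumping the raw schema dict, is what lets the weaker extraction models return flat values instead of nested objects.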
diff --git a/morphik.toml b/morphik.toml
index c7cffcd..7eeae8c 100644
--- a/morphik.toml
+++ b/morphik.toml
@@ -15,6 +15,7 @@ dev_permissions = ["read", "write", "admin"] # Default dev permissions
 # OpenAI models
 openai_gpt4o = { model_name = "gpt-4o", vision = true }
 openai_gpt4 = { model_name = "gpt-4" }
+openai_gpt4o_extraction = { model_name = "gpt-4o" }
 
 # Azure OpenAI models
 azure_gpt4 = { model_name = "gpt-4", api_base = "YOUR_AZURE_URL_HERE", api_version = "2023-05-15", deployment_id = "gpt-4-deployment" }
diff --git a/ui-component/app/page.tsx b/ui-component/app/page.tsx
index dea0a7f..8594b85 100644
--- a/ui-component/app/page.tsx
+++ b/ui-component/app/page.tsx
@@ -1,8 +1,20 @@
 "use client";
 
-import React from 'react';
+import React, { Suspense } from 'react';
 import MorphikUI from '@/components/MorphikUI';
+import { useSearchParams } from 'next/navigation';
+
+function HomeContent() {
+  const searchParams = useSearchParams();
+  const folderParam = searchParams.get('folder');
+
+  return <MorphikUI initialFolder={folderParam} />;
+}
 
 export default function Home() {
-  return <MorphikUI />;
+  return (
+    <Suspense fallback={<div>Loading...</div>}>
+      <HomeContent />
+    </Suspense>
+  );
 }
\ No newline at end of file
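A note on the page.tsx change above: in the Next.js app router, `useSearchParams()` must be called beneath a `<Suspense>` boundary to avoid forcing the whole page out of static rendering at build time, which is why the lookup moves into the child `HomeContent` component. The practical effect is that the documents view becomes deep-linkable; assuming a hypothetical local deployment, a URL such as

    http://localhost:3000/?folder=research

opens the UI with the `research` folder preselected.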
diff --git a/ui-component/components/MorphikUI.tsx b/ui-component/components/MorphikUI.tsx
index ce40ae6..cf996bb 100644
--- a/ui-component/components/MorphikUI.tsx
+++ b/ui-component/components/MorphikUI.tsx
@@ -21,7 +21,8 @@ const MorphikUI: React.FC<MorphikUIProps> = ({
   apiBaseUrl = DEFAULT_API_BASE_URL,
   isReadOnlyUri = false, // Default to editable URI
   onUriChange,
-  onBackClick
+  onBackClick,
+  initialFolder = null
 }) => {
   // State to manage connectionUri internally if needed
   const [currentUri, setCurrentUri] = useState(connectionUri);
@@ -41,6 +42,7 @@ const MorphikUI: React.FC<MorphikUIProps> = ({
   };
   const [activeSection, setActiveSection] = useState('documents');
   const [selectedGraphName, setSelectedGraphName] = useState(undefined);
+  const [isSidebarCollapsed, setIsSidebarCollapsed] = useState(false);
 
   // Extract auth token and API URL from connection URI if provided
   const authToken = currentUri ? extractTokenFromUri(currentUri) : null;
@@ -65,6 +67,8 @@ const MorphikUI: React.FC<MorphikUIProps> = ({
         connectionUri={currentUri}
         isReadOnlyUri={isReadOnlyUri}
         onUriChange={handleUriChange}
+        isCollapsed={isSidebarCollapsed}
+        setIsCollapsed={setIsSidebarCollapsed}
       />
@@ -88,7 +92,9 @@ const MorphikUI: React.FC<MorphikUIProps> = ({
         {activeSection === 'documents' && (
[hunk body corrupted in this capture; the documents section element rendered here gains an initialFolder={initialFolder} prop]
         )}
diff --git a/ui-component/components/documents/DocumentDetail.tsx b/ui-component/components/documents/DocumentDetail.tsx
index 20816e6..b5b0e5d 100644
--- a/ui-component/components/documents/DocumentDetail.tsx
+++ b/ui-component/components/documents/DocumentDetail.tsx
@@ -6,8 +6,9 @@ import { ScrollArea } from '@/components/ui/scroll-area';
 import { Button } from '@/components/ui/button';
 import { Accordion, AccordionContent, AccordionItem, AccordionTrigger } from '@/components/ui/accordion';
 import { Dialog, DialogContent, DialogDescription, DialogFooter, DialogHeader, DialogTitle, DialogTrigger } from '@/components/ui/dialog';
-import { Info, Folder as FolderIcon } from 'lucide-react';
+import { Info } from 'lucide-react';
 import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue } from "@/components/ui/select";
+import Image from 'next/image';
 
 import { Document, Folder } from '@/components/types';
@@ -20,6 +21,7 @@ interface DocumentDetailProps {
   refreshDocuments: () => void;
   refreshFolders: () => void;
   loading: boolean;
+  onClose: () => void;
 }
 
 const DocumentDetail: React.FC<DocumentDetailProps> = ({
@@ -30,7 +32,8 @@ const DocumentDetail: React.FC<DocumentDetailProps> = ({
   authToken,
   refreshDocuments,
   refreshFolders,
-  loading
+  loading,
+  onClose
 }) => {
 
   const [isMovingToFolder, setIsMovingToFolder] = useState(false);
@@ -103,8 +106,20 @@ const DocumentDetail: React.FC<DocumentDetailProps> = ({
   return (
[The remaining hunks of this file are corrupted in this capture. The recoverable fragments show: a "Document Details" header that gains a close control wired to the new onClose prop; a hunk around line 122 where the folder Select drops its FolderIcon in favor of a plain "Folder" label; and a new column-editing block with a name input bound to localColumnName (autoFocus), a localColumnType selector, and, when localColumnType === 'json', a JSON schema upload control that reports "Schema loaded" or "No schema uploaded" based on localColumnSchema.]
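Finally, a minimal sketch of how a client might exercise the new endpoint end to end. The request shape follows SetFolderRuleRequest and the apply_to_existing query parameter defined above; the host, folder id, token, and schema fields are placeholders:

```python
import requests

BASE_URL = "http://localhost:8000"   # placeholder host
FOLDER_ID = "folder_123"             # placeholder folder id
TOKEN = "YOUR_JWT_HERE"              # placeholder bearer token

payload = {
    "rules": [
        {
            "type": "metadata_extraction",  # the only rule type this endpoint accepts
            "schema": {
                "title": {"type": "string", "description": "Document title"},
                "year": {"type": "integer", "description": "Publication year"},
            },
        }
    ]
}

resp = requests.post(
    f"{BASE_URL}/folders/{FOLDER_ID}/set_rule",
    params={"apply_to_existing": "true"},  # also re-process documents already in the folder
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=payload,
)
resp.raise_for_status()

result = resp.json()
# "processing_results" reports how many documents were updated and any per-document errors
print(result["status"], result["processing_results"])
```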