Adityavardhan Agrawal 1792275cb8
Format fix, UI package update (#100)
Co-authored-by: Arnav Agrawal <aa779@cornell.edu>
2025-04-20 16:34:29 -07:00

272 lines
10 KiB
Python

#!/usr/bin/env python3
"""
SciER Graph Creation Script for Morphik
This script creates a knowledge graph from the SciER dataset.
It ingests the documents and creates a graph using custom prompt overrides.
"""
import argparse
import uuid
from collections import defaultdict
from typing import Dict, List, Optional, Tuple

# Import SciER data loader
from data_loader import load_jsonl
from dotenv import load_dotenv
from morphik import Morphik
from morphik.models import EntityExtractionExample, EntityExtractionPromptOverride, GraphPromptOverrides
from tqdm import tqdm
# Load environment variables
load_dotenv()
def setup_morphik_client() -> Morphik:
    """Create and return a Morphik client for the local instance.

    The timeout is generous because graph creation can take a while;
    adjust the connection parameters here if targeting a remote server.
    """
    client = Morphik(timeout=300000, is_local=True)
    return client
def load_scier_data(dataset_path: str = "test.jsonl", limit: Optional[int] = None) -> List[Dict]:
    """
    Load SciER dataset from the specified JSONL file.

    Args:
        dataset_path: Path to the JSONL file
        limit: Maximum number of records to load (None for all)

    Returns:
        List of dataset records
    """
    data = load_jsonl(dataset_path)
    # Compare against None (not truthiness) so an explicit limit of 0
    # returns an empty list instead of silently loading the whole dataset.
    if limit is not None:
        data = data[:limit]
    return data
def prepare_text_for_ingestion(records: List[Dict]) -> List[Dict]:
    """
    Prepare SciER records for ingestion into Morphik.

    Sentences sharing a ``doc_id`` are joined (newline-separated, in input
    order) into a single document text, and their ``ner``/``rel``
    annotations are concatenated as ground truth in the metadata.

    Args:
        records: List of SciER records (each with "doc_id", "sentence",
            "ner", and "rel" keys)

    Returns:
        List of dictionaries ready for ingestion, each with "text" and
        "metadata" keys
    """
    documents = []
    # Group records by doc_id to create complete documents
    doc_groups = defaultdict(list)
    for record in records:
        doc_groups[record["doc_id"]].append(record)
    # Convert grouped records to documents. Use a distinct loop variable
    # (doc_records) rather than shadowing the `records` parameter, which
    # the original loop clobbered.
    for doc_id, doc_records in doc_groups.items():
        text = "\n".join(record["sentence"] for record in doc_records)
        # Collect all entities and relations for ground truth
        all_entities = []
        all_relations = []
        for record in doc_records:
            all_entities.extend(record["ner"])
            all_relations.extend(record["rel"])
        documents.append(
            {
                "text": text,
                "metadata": {
                    "doc_id": doc_id,
                    "ground_truth_entities": all_entities,
                    "ground_truth_relations": all_relations,
                },
            }
        )
    return documents
def create_graph_extraction_override(entity_types: List[str]) -> EntityExtractionPromptOverride:
    """
    Create graph extraction prompt override with examples for both entities and relations.

    Args:
        entity_types: List of entity types (Dataset, Method, Task)

    Returns:
        EntityExtractionPromptOverride object
    """
    # Few-shot example labels keyed by entity type; the fixed key order
    # (Dataset, Method, Task) determines the order examples are emitted.
    example_labels = {
        "Dataset": [
            "ImageNet",
            "CIFAR-10",
            "MNIST",
            "Penn TreeBank",
            "SQuAD",
            "MultiNLI",
        ],
        "Method": [
            # General models
            "Convolutional Neural Network",
            "Random Forest",
            # Architecture-specific models from SciER
            "BERT",
            "Transformer",
            "LSTM",
            "Bidirectional LSTM",
            "self-attentive models",
            "seq2seq",
            # Components
            "attention mechanism",
            "feature extraction mechanisms",
        ],
        "Task": [
            # General tasks
            "Image Classification",
            "Named Entity Recognition",
            # NLP tasks from SciER
            "Machine Translation",
            "neural machine translation",
            "sentiment analysis",
            "entailment",
            "text classification",
            "natural language processing",
            "sequence-to-sequence problems",
            "NLP",
        ],
    }
    examples = [
        EntityExtractionExample(label=label, type=kind)
        for kind, labels in example_labels.items()
        if kind in entity_types
        for label in labels
    ]
    # Simplest version - bare standard placeholders
    prompt_template = """
Your task is to carefully read the following scientific text and extract specific information.
You need to extract:
1. **Entities:** Identify any mentions of Datasets, Methods, and Tasks. Use the entity examples provided below to understand what to look for, that is a very small list, there are many many entities.
2. **Relationships:** Identify relationships *between the extracted entities* based on the information stated in the text. Use only the relationship types defined below.
**Entity Examples (this is a very brief list, there are many many entities):**
{examples}
**Relationship Information:**
Desired Relationship Types (only extract these relationships, nothing else, there are a lot of relationships, be nuanced and careful, think hard about how entities relate to each other):
- Used-For: [Method/Dataset] is used for [Task]
- Feature-Of: [Feature] is a feature of [Method/Task]
- Hyponym-Of: [Specific] is a type of [General]
- Part-Of: [Component] is part of [System]
- Compare: [Entity A] is compared to [Entity B]
- Evaluate-For: [Method] is evaluated for [Metric/Task]
- Conjunction: [Entity A] is mentioned together with [Entity B] without a specific relation
- Evaluate-On: [Method] is evaluated on [Dataset]
- Synonym-Of: [Entity A] is the same as [Entity B]
**Instructions:**
- Extract entities first, identifying their label (the text mention) and type (Dataset, Method, or Task).
- Then, extract relationships between the entities you found. The 'source' and 'target' of the relationship MUST be the exact entity labels you extracted.
- Only extract information explicitly mentioned in the text. Do not infer or add outside knowledge.
- Format your entire output as a single JSON object containing two keys: "entities" (a list of entity objects) and "relationships" (a list of relationship objects).
**Text to analyze:**
{content}
"""
    return EntityExtractionPromptOverride(prompt_template=prompt_template, examples=examples)
def create_graph(db: Morphik, documents: List[Dict], model_name: str, run_id: str) -> Tuple[List[str], Dict]:
    """
    Create a knowledge graph from the documents.

    Args:
        db: Morphik client
        documents: List of documents to ingest
        model_name: Name of the model being used (for tracking)
        run_id: Unique identifier for this run

    Returns:
        Tuple of (list of document IDs, graphs dict)
    """
    print(f"\n=== Creating graph with {model_name} model ===")

    # Ingest each document, tagging its metadata so this run can be
    # identified later during evaluation.
    ingested_ids = []
    for document in tqdm(documents, desc="Ingesting documents"):
        document["metadata"]["evaluation_run_id"] = run_id
        document["metadata"]["model"] = model_name
        response = db.ingest_text(document["text"], metadata=document["metadata"])
        ingested_ids.append(response.external_id)

    # The entity-extraction override carries both entity and relationship
    # instructions; wrap it in the container object the API expects.
    overrides = GraphPromptOverrides(
        entity_extraction=create_graph_extraction_override(["Dataset", "Method", "Task"])
    )

    print("Creating knowledge graph with prompt overrides...")
    graph = db.create_graph(
        name=f"scier_{model_name}_{run_id}",
        documents=ingested_ids,
        prompt_overrides=overrides,
    )
    print(f"Created graph with {len(graph.entities)} entities and {len(graph.relationships)} relationships")
    return ingested_ids, {"graph": graph}
def main():
    """Main function to create a graph from the SciER dataset."""
    parser = argparse.ArgumentParser(description="SciER Graph Creation Script for Morphik")
    parser.add_argument("--limit", type=int, default=57, help="Maximum number of documents to process (default: 57)")
    parser.add_argument("--run-id", type=str, default=None, help="Unique run identifier (default: auto-generated)")
    parser.add_argument(
        "--model-name",
        type=str,
        default=None,
        help="Name of the currently configured model (default: auto-detected)",
    )
    args = parser.parse_args()

    # Fall back to a short random identifier when no run ID is supplied.
    run_id = args.run_id if args.run_id else str(uuid.uuid4())[:8]
    # No reliable auto-detection available yet; use a placeholder name.
    model_name = args.model_name if args.model_name else "default_model"
    print(f"Running graph creation for model: {model_name}")
    print(f"Run ID: {run_id}")

    db = setup_morphik_client()

    scier_data = load_scier_data("test.jsonl", limit=args.limit)
    print(f"Loaded {len(scier_data)} records")

    documents = prepare_text_for_ingestion(scier_data)
    print(f"Prepared {len(documents)} documents for ingestion")

    doc_ids, graphs = create_graph(db, documents, model_name, run_id)

    # Echo the graph name so the evaluation script can be pointed at it.
    graph_name = f"scier_{model_name}_{run_id}"
    print(f"\nGraph creation complete! Created graph: {graph_name}")
    print(f"To evaluate this graph, run: python evaluate_result.py --graph-name {graph_name}")


if __name__ == "__main__":
    main()