# Mirror of https://github.com/james-m-jordan/morphik-core.git
# Synced 2025-05-09 19:32:38 +00:00
import argparse
import sys
import uuid
from pathlib import Path

import pandas as pd
from datasets import Dataset, load_dataset
from dotenv import load_dotenv
from ragas import evaluate
from ragas.metrics import answer_correctness, context_precision, faithfulness
from tqdm import tqdm

# Add the SDK path to the Python path so the local (uninstalled) Morphik
# Python SDK under <repo-root>/sdks/python is importable.
sdk_path = str(Path(__file__).parent.parent / "sdks" / "python")
sys.path.insert(0, sdk_path)

# Import Morphik after adding the SDK path (hence the deliberate E402 waiver —
# this import must not be moved above the sys.path insert).
from morphik import Morphik  # noqa: E402

# Load environment variables (e.g. credentials/endpoints) from a .env file, if present.
load_dotenv()

# Connect to Morphik; module-level client shared by all functions below.
# NOTE(review): timeout=10000 — units not visible here; presumably milliseconds. Confirm against the SDK.
db = Morphik(timeout=10000, is_local=True)
|
|
|
|
|
|
# Generate a run identifier
|
|
def generate_run_id():
    """Return a fresh UUID4 string used to tag one evaluation run."""
    return f"{uuid.uuid4()}"
|
|
|
|
|
|
def load_hotpotqa_dataset(num_samples=10, split="validation"):
    """Load the HotpotQA "distractor" config and keep at most *num_samples* rows.

    Args:
        num_samples: Upper bound on the number of examples returned.
        split: Which dataset split to load (default: "validation").

    Returns:
        A ``datasets.Dataset`` with the selected subset.
    """
    full_split = load_dataset("hotpot_qa", "distractor", split=split, trust_remote_code=True)

    # Never request more rows than the split actually contains.
    sample_count = min(num_samples, len(full_split))
    return full_split.select(range(sample_count))
|
|
|
|
|
|
def process_with_morphik(dataset, run_id=None):
    """
    Process dataset with Morphik and prepare data for RAGAS evaluation.

    For each item this: (1) ingests the item's supporting paragraphs into
    Morphik tagged with ``evaluation_run_id``, (2) queries Morphik for an
    answer restricted to this run's documents, and (3) retrieves the
    supporting chunks so RAGAS can score them.

    Args:
        dataset: The dataset to process (iterable of HotpotQA-style dicts
            with "question", "answer", and "context" keys).
        run_id: Unique identifier for this evaluation run; auto-generated
            when None.

    Returns:
        Tuple ``(data_samples, run_id)`` where ``data_samples`` is a dict of
        parallel lists keyed by question/answer/contexts/ground_truth/run_id.
    """
    # Generate a run_id if not provided
    if run_id is None:
        run_id = generate_run_id()

    print(f"Using run identifier: {run_id}")

    # Parallel-list column layout, as expected by Dataset.from_dict downstream.
    data_samples = {
        "question": [],
        "answer": [],
        "contexts": [],
        "ground_truth": [],
        "run_id": [],  # Store run_id for each sample
    }

    for i, item in enumerate(tqdm(dataset, desc="Processing documents")):
        try:
            # Extract question and ground truth
            question = item["question"].strip()
            ground_truth = item["answer"].strip()

            if not question or not ground_truth:
                print(f"Skipping item {i}: Empty question or answer")
                continue

            # Ingest the document's context into Morphik. HotpotQA stores the
            # context as parallel lists of titles and sentence lists; flatten
            # each into a "<title>:\n<paragraph>" block.
            context = ""
            for title, sentences in zip(item["context"]["title"], item["context"]["sentences"]):
                paragraph = " ".join(sentences)
                context += f"{title}:\n{paragraph}\n\n"

            # Handle a potentially longer context
            # if len(context) > 10000:
            #     print(f"Warning: Long context ({len(context)} chars), truncating...")
            #     context = context[:10000]

            # Ingest text with run_id in metadata so the query/retrieval below
            # can be filtered to documents from this evaluation run only.
            db.ingest_text(
                context,
                metadata={
                    "source": "hotpotqa",
                    "question_id": item.get("_id", ""),
                    "item_index": i,
                    "evaluation_run_id": run_id,  # Add run_id to metadata
                },
                use_colpali=False,
            )

            # Query Morphik for the answer with concise prompt override —
            # HotpotQA ground truths are short, so verbose answers would hurt
            # answer_correctness scoring.
            prompt_override = {
                "query": {
                    "prompt_template": "Answer the following question based on the provided context. Your answer should be as concise as possible. If a yes/no answer is appropriate, just respond with 'Yes' or 'No'. Do not provide explanations or additional context unless absolutely necessary.\n\nQuestion: {question}\n\nContext: {context}"
                }
            }
            response = db.query(
                question,
                use_colpali=False,
                k=10,
                filters={"evaluation_run_id": run_id},
                prompt_overrides=prompt_override,
            )
            answer = response.completion

            if not answer:
                print(f"Warning: Empty answer for question: {question[:50]}...")
                answer = "No answer provided"

            # Get retrieved chunks for context with filter by run_id.
            # NOTE(review): earlier items in this run share the same run_id, so
            # retrieval may surface other questions' contexts — confirm this is
            # the intended corpus for context_precision.
            chunks = db.retrieve_chunks(query=question, k=10, filters={"evaluation_run_id": run_id})  # Filter by run_id
            context_texts = [chunk.content for chunk in chunks]

            if not context_texts:
                print(f"Warning: No contexts retrieved for question: {question[:50]}...")
                context_texts = ["No context retrieved"]

            # Add to our dataset
            data_samples["question"].append(question)
            data_samples["answer"].append(answer)
            data_samples["contexts"].append(context_texts)
            data_samples["ground_truth"].append(ground_truth)
            data_samples["run_id"].append(run_id)

        except Exception as e:
            import traceback

            # Best-effort loop: log the failure and continue with the next item.
            print(f"Error processing item {i}:")
            print(f"Question: {item.get('question', 'N/A')[:50]}...")
            print(f"Error: {e}")
            traceback.print_exc()
            continue

    return data_samples, run_id
|
|
|
|
|
|
def _print_metric_summary(df_result):
    """Coerce each RAGAS metric column to numeric and print its mean.

    Mutates *df_result* in place: metric columns are replaced by their
    numeric coercion (so the values later written to CSV match what is
    printed). Non-numeric scores become NaN and are excluded from the mean.
    """
    print("\nDetailed Metric Analysis:")
    for column in ["faithfulness", "answer_correctness", "context_precision"]:
        if column not in df_result.columns:
            continue
        try:
            # errors="coerce" turns non-numeric scores into NaN instead of raising.
            df_result[column] = pd.to_numeric(df_result[column], errors="coerce")
            mean_value = df_result[column].mean(skipna=True)
            if pd.notna(mean_value):
                print(f"{column}: {mean_value:.4f}")
            else:
                print(f"{column}: No valid numeric values found")
        except Exception as e:
            print(f"Error processing {column}: {e}")
            print(f"Values: {df_result[column].head().tolist()}")


def run_evaluation(num_samples=5, output_file="ragas_results.csv", run_id=None):
    """
    Run the full evaluation pipeline.

    Loads HotpotQA, answers each question through Morphik, scores the answers
    with RAGAS, prints a per-metric summary, and saves the results to CSV.

    Args:
        num_samples: Number of samples to use from the dataset
        output_file: Path to save the results CSV. When left at the default,
            the run_id is appended to the filename so repeated runs do not
            overwrite each other.
        run_id: Optional run identifier. If None, a new one will be generated

    Returns:
        ``(df_result, run_id)`` on success, or ``None`` when no samples were
        successfully processed or an error occurred.
    """
    try:
        # Load dataset
        print("Loading HotpotQA dataset...")
        hotpot_dataset = load_hotpotqa_dataset(num_samples=num_samples)
        print(f"Loaded {len(hotpot_dataset)} samples from HotpotQA")

        # Process with Morphik (may regenerate run_id when it was None)
        print("Processing with Morphik...")
        data_samples, run_id = process_with_morphik(hotpot_dataset, run_id=run_id)

        # Bail out early if nothing was processed successfully.
        if len(data_samples["question"]) == 0:
            print("Error: No samples were successfully processed. Exiting.")
            return None

        print(f"Successfully processed {len(data_samples['question'])} samples")

        # Convert to the columnar format RAGAS expects
        ragas_dataset = Dataset.from_dict(data_samples)

        # Run RAGAS evaluation
        print("Running RAGAS evaluation...")
        metrics = [faithfulness, answer_correctness, context_precision]
        result = evaluate(ragas_dataset, metrics=metrics)

        # Convert results to DataFrame and tag every row with the run id
        df_result = result.to_pandas()
        df_result["run_id"] = run_id

        print("\nRAGAS Evaluation Results:")
        print(df_result)

        # Per-metric means (also normalizes metric columns to numeric in place)
        _print_metric_summary(df_result)

        # Include run_id in the output filename if not explicitly provided
        if output_file == "ragas_results.csv":
            base_name = output_file.rsplit(".", 1)[0]
            output_file = f"{base_name}_{run_id}.csv"

        # Save results
        df_result.to_csv(output_file, index=False)
        print(f"Results saved to {output_file}")

        return df_result, run_id

    except Exception as e:
        import traceback

        print(f"Error in evaluation: {e}")
        traceback.print_exc()
        print("Exiting due to error.")
        return None
|
|
|
|
|
|
def main():
    """Parse command-line options and kick off a RAGAS evaluation run."""
    arg_parser = argparse.ArgumentParser(description="Run RAGAS evaluation on Morphik using HotpotQA dataset")
    arg_parser.add_argument("--samples", type=int, default=5, help="Number of samples to use (default: 5)")
    arg_parser.add_argument(
        "--output",
        type=str,
        default="ragas_results.csv",
        help="Output file for results (default: ragas_results.csv)",
    )
    arg_parser.add_argument(
        "--run-id",
        type=str,
        default=None,
        help="Specific run identifier to use (default: auto-generated UUID)",
    )
    options = arg_parser.parse_args()

    run_evaluation(num_samples=options.samples, output_file=options.output, run_id=options.run_id)


if __name__ == "__main__":
    main()
|