Update project structure and add experiment files

This commit is contained in:
james-m-jordan 2025-05-07 14:28:10 +00:00
parent 1ee288e07b
commit 4ab492fffd
43 changed files with 2542 additions and 959 deletions

15
.cursor/prompt.md Normal file
View File

@ -0,0 +1,15 @@
You are LabAgent v2.
When the user asks to create or update an experiment you MUST:
1. Gather: aim, project, plate id, full plate layout, sample-prep, downstream-app, researcher.
2. Call create_experiment_with_plate (or update) using the YAML schema above.
3. For every lab step (seed, transfect, prep, assay) call add_experiment_task → a GitHub Issue is created and appended under tasks: with id and status: open.
4. After each Issue is created, use the gh CLI to also add it to the GitHub Project "Lab Tasks Board".
5. Commit the new/updated YAML and data files. Commit message pattern:
Experiment: <action> EXP-XXXX <short title>
6. When the user runs lab record or drops a file into Data/EXP-XXXX/, call record_data to append data:.
7. On "finish experiment" requests:
• verify that all tasks.status == closed and at least one data: entry exists;
• set status: completed;
• commit and update SESSION_LOG.md.
8. Never ask for confirmation when data can be auto-inferred (Smart-Fill); ask only when a required field is ambiguous.
9. Keep chat concise; rely on repository state as the single source of truth.

View File

@ -0,0 +1,11 @@
# Base image: Dev Containers Python 3.10 (Debian-based), used by Codespaces.
FROM mcr.microsoft.com/devcontainers/python:3.10
# Install system dependencies
# git for repo operations, curl for downloads; apt lists removed to keep the layer small.
RUN apt-get update && \
apt-get install -y git curl && \
rm -rf /var/lib/apt/lists/*
# Set up a working directory
WORKDIR /workspaces/lab-docs
# (Optional) Install additional tools or dependencies here

View File

@ -13,7 +13,7 @@
"containerEnv": {
"OPENAI_API_KEY": "${localEnv:OPENAI_API_KEY}"
},
"postCreateCommand": "pip install -r requirements.txt && chmod +x .devcontainer/jupyter-setup.sh && .devcontainer/jupyter-setup.sh && chmod +x .devcontainer/windsurf-auth-setup.sh",
"postCreateCommand": "pip install -r requirements.txt && pip install -e Agent && chmod +x .devcontainer/jupyter-setup.sh && .devcontainer/jupyter-setup.sh && chmod +x .devcontainer/windsurf-auth-setup.sh",
"postStartCommand": "bash .devcontainer/postStartCommand.sh && WINDSURF_AUTH_TOKEN='eyJhbGciOiJSUzI1NiIsImtpZCI6IjU5MWYxNWRlZTg0OTUzNjZjOTgyZTA1MTMzYmNhOGYyNDg5ZWFjNzIiLCJ0eXAiOiJKV1QifQ.eyJwaWN0dXJlIjoiaHR0cHM6Ly9hdmF0YXJzLmdpdGh1YnVzZXJjb250ZW50LmNvbS91LzE5NDMyOTQ4ND92PTQiLCJpc3MiOiJodHRwczovL3NlY3VyZXRva2VuLmdvb2dsZS5jb20vZXhhMi1mYjE3MCIsImF1ZCI6ImV4YTItZmIxNzAiLCJhdXRoX3RpbWUiOjE3NDY1NzA2ODAsInVzZXJfaWQiOiJNTVJZY3NVb3ZVaFNVc2RhMkhrd1J4UE5tTWQyIiwic3ViIjoiTU1SWWNzVW92VWhTVXNkYTJIa3dSeFBObU1kMiIsImlhdCI6MTc0NjU3MDY4MSwiZXhwIjoxNzQ2NTc0MjgxLCJlbWFpbCI6ImppbUBqb3JkYW5sYWIub3JnIiwiZW1haWxfdmVyaWZpZWQiOmZhbHNlLCJmaXJlYmFzZSI6eyJpZGVudGl0aWVzIjp7ImdpdGh1Yi5jb20iOlsiMTk0MzI5NDg0Il0sImVtYWlsIjpbImppbUBqb3JkYW5sYWIub3JnIl19LCJzaWduX2luX3Byb3ZpZGVyIjoiZ2l0aHViLmNvbSJ9fQ.kgY8pNERRK3d2weIc4eR0EI_JKfATIxE9b6ACKdQE1XVwU26_p07PGfvdWo4ty8oXjCdUdnUiprM1LKaT2yZPGnleOdeJtH31Ua1DNc7hNTEideMeTyZUAOXv6O1VJdXqpcRfjc5Q5JxEjJdj1cGdomFA1c_kn3VbGyL8BsAfH6Sg6q7fB4eRRQ5MlHPBDxQl7neHsdDVGhqGprRnWfJsOI0PJhsWC4jzSBM5HO3uFKOnl_9-BdGY_zN6j_uRcFXfHB3VxfSecinepjz3u5fdmEd71YpGJNRhGXYiM7pZBhETCvA9Ri-b1Jh74dsoAyPfmnDOdt0c0xBF2TmPEPgZA' .devcontainer/windsurf-auth-setup.sh && jupyter notebook --no-browser --ip=0.0.0.0 --port=8888 --NotebookApp.token='' --NotebookApp.password='' --allow-root &",
"extensions": [
"ms-python.python",
@ -23,7 +23,10 @@
"ms-toolsai.vscode-jupyter-cell-tags",
"ms-toolsai.jupyter-keymap",
"ms-toolsai.jupyter-renderers",
"windsurf-dev.windsurf"
"windsurf-dev.windsurf",
"github.vscode-pull-request-github",
"ms-vscode.github-issues-prs",
"gruntfuggly.todo-tree"
],
"forwardPorts": [8888],
"remoteUser": "vscode",

View File

@ -0,0 +1,39 @@
{
"name": "Lab Management Codespace",
"build": {
"dockerfile": "Dockerfile"
},
"settings": {
"terminal.integrated.shell.linux": "/bin/bash",
"python.defaultInterpreterPath": "/usr/local/bin/python",
"jupyter.alwaysTrustNotebooks": true,
"workbench.startupEditor": "none"
},
"postCreateCommand": "pip install -r requirements.txt && chmod +x .devcontainer/jupyter-setup.sh && .devcontainer/jupyter-setup.sh && chmod +x .devcontainer/windsurf-auth-setup.sh",
"postStartCommand": "bash .devcontainer/postStartCommand.sh && WINDSURF_AUTH_TOKEN='eyJhbGciOiJSUzI1NiIsImtpZCI6IjU5MWYxNWRlZTg0OTUzNjZjOTgyZTA1MTMzYmNhOGYyNDg5ZWFjNzIiLCJ0eXAiOiJKV1QifQ.eyJwaWN0dXJlIjoiaHR0cHM6Ly9hdmF0YXJzLmdpdGh1YnVzZXJjb250ZW50LmNvbS91LzE5NDMyOTQ4ND92PTQiLCJpc3MiOiJodHRwczovL3NlY3VyZXRva2VuLmdvb2dsZS5jb20vZXhhMi1mYjE3MCIsImF1ZCI6ImV4YTItZmIxNzAiLCJhdXRoX3RpbWUiOjE3NDY1NzA2ODAsInVzZXJfaWQiOiJNTVJZY3NVb3ZVaFNVc2RhMkhrd1J4UE5tTWQyIiwic3ViIjoiTU1SWWNzVW92VWhTVXNkYTJIa3dSeFBObU1kMiIsImlhdCI6MTc0NjU3MDY4MSwiZXhwIjoxNzQ2NTc0MjgxLCJlbWFpbCI6ImppbUBqb3JkYW5sYWIub3JnIiwiZW1haWxfdmVyaWZpZWQiOmZhbHNlLCJmaXJlYmFzZSI6eyJpZGVudGl0aWVzIjp7ImdpdGh1Yi5jb20iOlsiMTk0MzI5NDg0Il0sImVtYWlsIjpbImppbUBqb3JkYW5sYWIub3JnIl19LCJzaWduX2luX3Byb3ZpZGVyIjoiZ2l0aHViLmNvbSJ9fQ.kgY8pNERRK3d2weIc4eR0EI_JKfATIxE9b6ACKdQE1XVwU26_p07PGfvdWo4ty8oXjCdUdnUiprM1LKaT2yZPGnleOdeJtH31Ua1DNc7hNTEideMeTyZUAOXv6O1VJdXqpcRfjc5Q5JxEjJdj1cGdomFA1c_kn3VbGyL8BsAfH6Sg6q7fB4eRRQ5MlHPBDxQl7neHsdDVGhqGprRnWfJsOI0PJhsWC4jzSBM5HO3uFKOnl_9-BdGY_zN6j_uRcFXfHB3VxfSecinepjz3u5fdmEd71YpGJNRhGXYiM7pZBhETCvA9Ri-b1Jh74dsoAyPfmnDOdt0c0xBF2TmPEPgZA' .devcontainer/windsurf-auth-setup.sh && jupyter notebook --no-browser --ip=0.0.0.0 --port=8888 --NotebookApp.token='' --NotebookApp.password='' --allow-root &",
"extensions": [
"ms-python.python",
"ms-azuretools.vscode-docker",
"redhat.vscode-yaml",
"ms-toolsai.jupyter",
"ms-toolsai.vscode-jupyter-cell-tags",
"ms-toolsai.jupyter-keymap",
"ms-toolsai.jupyter-renderers",
"windsurf-dev.windsurf"
],
"forwardPorts": [8888],
"remoteUser": "vscode",
"features": {
"github-cli": "latest"
},
"customizations": {
"codespaces": {
"openFiles": ["/workspaces/docs/Analysis/protocol_dashboard.ipynb"]
},
"vscode": {
"settings": {
"windsurf.authTokenPath": "~/.windsurf/auth.json"
}
}
}
}

View File

@ -0,0 +1,348 @@
"""
Code snippet to be added to agent_runner.py to handle creating experiments with multiblock markdown templates.
This would extend the existing functionality to support the richer experiment format.
"""
def handle_create_multiblock_experiment(self, args: Dict[str, Any]):
    """
    Create a new experiment from the multiblock markdown template.

    Renders Templates/experiment_multiblock.md with the supplied args,
    writes the result under Experiments/, creates matching Data/ folders,
    updates researcher profiles, appends to CHANGELOG.md, and seeds
    TASKS.md from next_steps when present.

    Args should include:
        title (str): Experiment title
        researchers (list): List of researchers involved
        protocol_id (str): Protocol ID (e.g., PROT-XXXX)
        protocol_name (str): Protocol name
        project (str): Project name
        aim (str): Brief description of experimental aim
        cell_lines (list): List of cell lines used
        plate_format (str): Plate format (e.g., 24-well)
        condition_map (str): Map of conditions in the plate
        additional_metadata (dict): Any additional metadata fields

    Returns:
        dict with experiment_id, path, data_dir, and analysis_script
        (analysis_script is None when no known analysis_type was given).
    """
    # Load the multiblock template
    template_path = os.path.join("Templates", "experiment_multiblock.md")
    with open(template_path, "r") as f:
        template = f.read()

    # Generate a unique experiment ID if not provided, one past the
    # highest EXP-#### number already on disk.
    if not args.get("experiment_id"):
        existing_ids = [f for f in os.listdir("Experiments") if f.startswith("EXP-")]
        existing_nums = [int(f.split("-")[1].split("_")[0])
                         for f in existing_ids if re.match(r"EXP-\d+", f)]
        next_num = max(existing_nums) + 1 if existing_nums else 1
        args["experiment_id"] = f"EXP-{next_num:04d}"

    # Default the date to today when not supplied.
    if not args.get("date"):
        args["date"] = datetime.now().strftime("%Y-%m-%d")

    # Fill {{key}} placeholders with scalar args.
    # This is a simple placeholder - in a real implementation, we'd handle the
    # multiple frontmatter blocks more carefully.
    for key, value in args.items():
        if isinstance(value, (str, int, float)):
            template = template.replace(f"{{{{{key}}}}}", str(value))

    # Generate filename with experiment ID and slugified title.
    experiment_id = args.get("experiment_id")
    title = args.get("title", "untitled").lower().replace(" ", "-")
    filename = f"{experiment_id}-{title}.md"
    out_path = os.path.join("Experiments", filename)

    # Suffix with -1, -2, ... until the filename is unique.
    i = 1
    while os.path.exists(out_path):
        filename = f"{experiment_id}-{title}-{i}.md"
        out_path = os.path.join("Experiments", filename)
        i += 1

    # Create data directories for the experiment.
    data_dir = os.path.join("Data", experiment_id)
    os.makedirs(os.path.join(data_dir, "raw"), exist_ok=True)
    os.makedirs(os.path.join(data_dir, "figures"), exist_ok=True)

    # Create analysis script placeholder if a known analysis type was
    # requested. Fixed: script_path is initialised to None instead of the
    # fragile `"script_path" in locals()` probe used previously.
    analysis_dir = "Analysis"
    os.makedirs(analysis_dir, exist_ok=True)
    script_path = None
    if args.get("analysis_type") == "mRNA_stability":
        script_path = os.path.join(analysis_dir, f"{experiment_id}_mRNA_stability_analysis.R")
        # Here we would create a placeholder R script tailored to mRNA stability analysis
    elif args.get("analysis_type") == "qPCR":
        script_path = os.path.join(analysis_dir, f"{experiment_id}_qPCR_analysis.R")
        # Create a placeholder qPCR analysis script

    # Write the experiment file
    with open(out_path, "w") as f:
        f.write(template)

    # Log the action
    self.logger.info(f"Created multiblock experiment: {out_path}")
    self.logger.info(f"Created data directories: {data_dir}/raw and {data_dir}/figures")

    # Update each researcher's profile. Fixed: the cell-line updates are
    # nested inside the researcher loop, so researcher_id is always bound
    # (previously this raised NameError when researchers was empty) and
    # every researcher — not just the last — gets the cell-line entries.
    for researcher in args.get("researchers", []):
        researcher_id = researcher.replace(" ", "_").lower()
        self._update_user_profile(researcher_id, "recent_experiments", experiment_id)
        for cell_line in args.get("cell_lines", []):
            if isinstance(cell_line, dict) and "name" in cell_line:
                self._update_user_profile(researcher_id, "frequent_cell_lines", cell_line["name"])
            elif isinstance(cell_line, str):
                self._update_user_profile(researcher_id, "frequent_cell_lines", cell_line)

    # Append to CHANGELOG.md
    self.append_changelog(f"Created new multiblock experiment {experiment_id}: {args.get('title')}")

    # Check if there are any experiment tasks to add to TASKS.md
    if args.get("next_steps"):
        self.add_experiment_tasks_to_tasklist(experiment_id, args.get("next_steps"))

    return {
        "experiment_id": experiment_id,
        "path": out_path,
        "data_dir": data_dir,
        "analysis_script": script_path
    }
def handle_update_multiblock_experiment(self, args: Dict[str, Any]):
    """
    Handle updating an existing multiblock experiment markdown file.

    Locates the experiment's .md file under Experiments/ by ID, then
    applies any combination of: a status change in the YAML frontmatter,
    a regenerated "Next Steps" checklist (mirrored into TASKS.md), and a
    replacement of one named markdown section.

    Args should include:
        experiment_id (str): Experiment ID to update
        section (str): Section to update (metadata, sample_metadata, results, interpretation, etc.)
        content (dict or str): Content to update in the section
        next_steps (list, optional): Updated next steps list
        status (str, optional): New experiment status

    Returns:
        dict with experiment_id, path, updated_section — or None when the
        experiment_id is missing or its file cannot be found.
    """
    experiment_id = args.get("experiment_id")
    if not experiment_id:
        self.logger.error("Missing experiment_id for update_multiblock_experiment.")
        return
    # Find experiment file: first .md filename containing the ID wins.
    exp_dir = "Experiments"
    exp_file = None
    for fname in os.listdir(exp_dir):
        if experiment_id in fname and fname.endswith(".md"):
            exp_file = os.path.join(exp_dir, fname)
            break
    if not exp_file or not os.path.exists(exp_file):
        self.logger.error(f"Experiment file not found for id: {experiment_id}")
        return
    # Read the current file content
    with open(exp_file, "r") as f:
        content = f.read()
    # Process updates - this is a simplified example
    # A real implementation would parse the Markdown and YAML blocks properly
    section = args.get("section")
    section_content = args.get("content")
    # Handle status updates: rewrite the "status: ..." frontmatter line in place.
    if args.get("status"):
        new_status = args.get("status")
        status_pattern = r"status: .*"
        content = re.sub(status_pattern, f"status: {new_status}", content)
    # Handle next steps updates
    if args.get("next_steps"):
        # Replace everything from the Next Steps heading up to the next
        # numbered heading (or end of file) with a regenerated checklist.
        next_steps_pattern = r"# 5⃣ Next Steps ✅.*?(?=# 6⃣|$)"
        next_steps_content = "# 5⃣ Next Steps ✅\n_Check boxes when complete. These can auto-update TASKS.md._\n\n"
        for step in args.get("next_steps"):
            checked = "x" if step.get("completed") else " "
            next_steps_content += f"- [{checked}] {step.get('description')}\n"
        content = re.sub(next_steps_pattern, next_steps_content, content, flags=re.DOTALL)
        # Keep TASKS.md in sync with the new checked/unchecked state.
        self.update_tasks_from_experiment(experiment_id, args.get("next_steps"))
    # Handle section-specific updates
    if section and section_content:
        if section.lower() in ["metadata", "sample_metadata", "reagents"]:
            # Update YAML frontmatter blocks
            # This would require more sophisticated YAML parsing in a real implementation
            pass
        elif section.lower() in ["results", "interpretation", "discussion"]:
            # Replace the "## <Section>" markdown block up to the next
            # heading (or end of file).
            section_pattern = rf"## {section.title()}.*?(?=##|$)"
            new_section = f"## {section.title()}\n{section_content}\n\n"
            content = re.sub(section_pattern, new_section, content, flags=re.DOTALL)
    # Write updated content back to file
    with open(exp_file, "w") as f:
        f.write(content)
    # Log the update
    self.logger.info(f"Updated multiblock experiment: {exp_file}")
    # Append to CHANGELOG.md
    self.append_changelog(f"Updated experiment {experiment_id}: {section if section else 'various sections'}")
    # If experiment is completed, verify all required fields are present
    if args.get("status") == "completed":
        self.validate_experiment_completion(experiment_id, exp_file)
    return {
        "experiment_id": experiment_id,
        "path": exp_file,
        "updated_section": section
    }
def validate_experiment_completion(self, experiment_id, file_path):
    """Validate that a completed experiment has all required sections.

    A section counts as missing when its heading is absent, or when
    placeholder text (an _italicised hint_) still appears after it.
    On failure a GitHub Issue is opened via self.handle_open_issue;
    on success all related TASKS.md entries are marked complete.

    Returns:
        bool: True when the experiment record is complete.
    """
    with open(file_path, "r") as f:
        content = f.read()
    required_sections = [
        "# 3⃣ Results & Analysis",
        "# 4⃣ Interpretation"
    ]
    missing = []
    for section in required_sections:
        # Fixed: escape the heading before interpolating it into the
        # pattern — headings are literal text, not regex syntax.
        placeholder = re.search(rf"{re.escape(section)}.*?_[^_]*_", content, re.DOTALL)
        if section not in content or placeholder:
            # Section missing or only contains placeholder text
            missing.append(section.replace("#", "").strip())
    if missing:
        issue_title = f"Experiment {experiment_id} missing required sections"
        issue_body = f"The following required sections need to be completed: {', '.join(missing)}. Please update the experiment record."
        self.handle_open_issue({"title": issue_title, "body": issue_body})
        return False
    # Mark related tasks as complete in TASKS.md
    self.mark_experiment_complete_in_tasks(experiment_id)
    return True
def update_tasks_from_experiment(self, experiment_id, next_steps):
    """Update TASKS.md based on experiment next steps.

    Finds (or creates) a "### <EXP-ID> Tasks" subsection under the
    "## Lab Tasks" heading and replaces its checklist with one line per
    entry in next_steps. No-op when TASKS.md does not exist.
    """
    if not os.path.exists("TASKS.md"):
        return
    with open("TASKS.md", "r") as f:
        tasks_content = f.readlines()
    # Find experiment section in TASKS.md or create it.
    # NOTE: '"##" in line' also matches "###" lines, which is what lets
    # the per-experiment "### EXP-XXXX Tasks" subsection be found here.
    exp_section_idx = -1
    for i, line in enumerate(tasks_content):
        if experiment_id in line and "##" in line:
            exp_section_idx = i
            break
    if exp_section_idx == -1:
        # Section not found, append at the end of Lab Tasks
        lab_tasks_idx = -1
        for i, line in enumerate(tasks_content):
            if "## Lab Tasks" in line:
                lab_tasks_idx = i
                break
        if lab_tasks_idx != -1:
            # Create new section directly below the Lab Tasks heading.
            tasks_content.insert(lab_tasks_idx + 1, f"### {experiment_id} Tasks\n")
            exp_section_idx = lab_tasks_idx + 1
        else:
            # Create Lab Tasks section and experiment section
            tasks_content.append("\n## Lab Tasks\n")
            tasks_content.append(f"### {experiment_id} Tasks\n")
            exp_section_idx = len(tasks_content) - 1
    # Build the replacement checklist: one "- [x]"/"- [ ]" line per step.
    updated_tasks = []
    for step in next_steps:
        checked = "x" if step.get("completed") else " "
        updated_tasks.append(f"- [{checked}] {step.get('description')}\n")
    # Find the end of the section: the next heading line ("##" or "###"),
    # or end of file.
    end_idx = len(tasks_content)
    for i in range(exp_section_idx + 1, len(tasks_content)):
        if tasks_content[i].startswith("##"):
            end_idx = i
            break
    # Replace the tasks in this section
    new_content = tasks_content[:exp_section_idx + 1] + updated_tasks + tasks_content[end_idx:]
    with open("TASKS.md", "w") as f:
        f.writelines(new_content)
    self.logger.info(f"Updated {experiment_id} tasks in TASKS.md")
def mark_experiment_complete_in_tasks(self, experiment_id):
    """Mark all tasks for an experiment as complete in TASKS.md when the experiment is completed."""
    if not os.path.exists("TASKS.md"):
        return
    with open("TASKS.md", "r") as fh:
        lines = fh.readlines()

    changed = False
    inside_section = False
    for idx, text in enumerate(lines):
        # A heading line mentioning the experiment opens its section.
        if experiment_id in text and "##" in text:
            inside_section = True
            continue
        if not inside_section:
            continue
        if text.startswith("##"):
            # Any subsequent heading closes the section.
            inside_section = False
        elif text.strip().startswith("- [ ]"):
            # Tick the first checkbox on this unchecked task line.
            lines[idx] = text.replace("- [ ]", "- [x]", 1)
            changed = True

    if changed:
        with open("TASKS.md", "w") as fh:
            fh.writelines(lines)
        self.logger.info(f"Marked all tasks for {experiment_id} as complete in TASKS.md")
def add_experiment_tasks_to_tasklist(self, experiment_id, tasks):
    """Add tasks from experiment next steps to TASKS.md."""
    # Seed a fresh task list when none exists yet.
    if not os.path.exists("TASKS.md"):
        with open("TASKS.md", "w") as fh:
            fh.write("# Lab Task List\n\n## Lab Tasks\n")

    with open("TASKS.md", "r") as fh:
        body = fh.read()

    # Guarantee the umbrella "Lab Tasks" heading is present.
    if "## Lab Tasks" not in body:
        body += "\n## Lab Tasks\n"

    # An existing per-experiment section is left alone here; it is
    # managed by update_tasks_from_experiment instead.
    if f"### {experiment_id}" in body:
        return

    # Render the new checklist section, one unchecked box per task.
    section_lines = [f"\n### {experiment_id} Tasks\n"]
    section_lines.extend(f"- [ ] {task.get('description')}\n" for task in tasks)
    section = "".join(section_lines)

    # Splice the section in right after the "## Lab Tasks" heading line.
    anchor = body.find("## Lab Tasks")
    if anchor != -1:
        cut = anchor + len("## Lab Tasks") + 1
        body = body[:cut] + section + body[cut:]
    else:
        body += section

    with open("TASKS.md", "w") as fh:
        fh.write(body)
    self.logger.info(f"Added {experiment_id} tasks to TASKS.md")

81
Agent/init_extensions.py Normal file
View File

@ -0,0 +1,81 @@
#!/usr/bin/env python3
"""
Initialize VS Code extensions and set up lab environment.
Run this with 'python -m Agent.init_extensions' or 'lab init-extensions'.
"""
import glob
import logging
import os
import subprocess
# Root logging config for this CLI: timestamped "name - level - message" lines.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
# Module-level logger shared by all helpers below.
logger = logging.getLogger("lab-init")
def run_command(cmd, desc=None):
    """Run a shell command and log the result"""
    if desc:
        logger.info(f"Running: {desc}")
    completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    ok = completed.returncode == 0
    if ok:
        if desc:
            logger.info(f"{desc} completed successfully")
    else:
        if desc:
            logger.error(f"{desc} failed: {completed.stderr}")
    return ok
def setup_extensions():
    """Set up VS Code extensions and GitHub CLI.

    Verifies the gh CLI and its authentication, checks whether the
    GitHub Issues extension is installed, and creates SESSION_LOG.md
    plus the Experiments/ and Data/ directories when missing.

    Returns:
        bool: True on success, False when the GitHub CLI is unavailable.
    """
    # Verify GitHub CLI is installed
    if run_command("gh --version", "Checking GitHub CLI"):
        logger.info("GitHub CLI is available")
    else:
        logger.error("GitHub CLI is not installed or not in PATH")
        return False
    # Verify GitHub authentication (non-fatal: only warn when missing)
    if run_command("gh auth status", "Checking GitHub authentication"):
        logger.info("GitHub CLI is authenticated")
    else:
        logger.warning("GitHub CLI is not authenticated - Some features may not work")
        logger.info("Use 'gh auth login' to authenticate")
    # Check if GitHub Issues extension is active.
    # Fixed: os.path.exists() does not expand wildcards, so the original
    # check was always False; glob.glob() actually matches the versioned
    # extension directory name.
    pattern = os.path.expanduser("~/.vscode-server/extensions/github.vscode-pull-request-github-*")
    if glob.glob(pattern):
        logger.info("GitHub Pull Requests and Issues extension is installed")
    else:
        logger.warning("GitHub Pull Requests and Issues extension may not be installed")
        logger.info("Please check VS Code extensions")
    # Create SESSION_LOG.md if it doesn't exist
    if not os.path.exists("SESSION_LOG.md"):
        with open("SESSION_LOG.md", "w") as f:
            f.write("# Lab Session Log\n\n")
            f.write("This file tracks lab activities and completed experiments.\n\n")
        logger.info("Created SESSION_LOG.md")
    # Create Experiments directory if it doesn't exist
    os.makedirs("Experiments", exist_ok=True)
    logger.info("Verified Experiments directory exists")
    # Create Data directory if it doesn't exist
    os.makedirs("Data", exist_ok=True)
    logger.info("Verified Data directory exists")
    return True
def main():
    """Main entry point"""
    logger.info("Initializing LabAgent environment...")
    if not setup_extensions():
        logger.error("❌ Failed to initialize LabAgent environment")
        return
    # Success path: confirm readiness and show example prompts.
    logger.info("✅ LabAgent environment initialized")
    print("\nLabAgent is ready to use! Try asking:")
    print(" - Create a new siRNA screen experiment")
    print(" - Record data for experiment EXP-XXXXXX")
    print(" - What experiments are in progress?")
if __name__ == "__main__":
    main()

View File

@ -0,0 +1,411 @@
Implementation Plan: Chat-Driven Lab Management System
Overview and Goals
This plan outlines a smart, chat-driven lab management system for the the-jordan-lab/docs repository. The system will operate entirely within GitHub Codespaces, requiring no local setup. Users (students or researchers) will interact via natural language chat with an AI Cursor agent to manage lab protocols, experiments, and data. The agent will translate instructions into structured YAML files and perform actions through a deterministic Python task-runner. Key goals include:
• Monorepo Organization: A unified repository structure (Protocols/, Projects/, Experiments/, Data/, Templates/, etc.) to centralize protocols, experiment plans, data, and templates for easy access and traceability . This eliminates scattered notes or "it's on my machine" issues by keeping all records in one place, under version control.
• Fully Containerized Environment: A .devcontainer configuration ensures a consistent, reproducible development environment on Codespaces . This allows any lab member to launch a ready-to-use workspace with all necessary tools (Python, libraries, etc.) pre-installed, avoiding manual setup.
• Cursor Chat Agent with Function Calling: Use an AI assistant (Cursor) that interprets lab instructions (e.g. “Plan a PCR experiment using antibody X next week”) and produces structured outputs. The agent employs OpenAI-style function calling to output JSON/YAML actions, enabling reliable multi-step workflows where the AI can take actions like creating files or updating records . The structured output approach ensures the AIs responses strictly conform to expected schemas for deterministic execution.
• Deterministic Task Runner: A Python-based task-runner will consume the agents JSON instructions and execute them (create/edit YAML files, commit to git, etc.). This separation guarantees that while the AI suggests actions, only the controlled code performs changes, ensuring reproducibility and preventing non-deterministic AI direct edits. Every action is logged and versioned.
• “Smart-Fill” Metadata Suggestions: The system will intelligently auto-populate metadata fields (e.g. reagents, conditions) to reduce user burden. It leverages:
1. Vector embeddings of the repository's own content — protocols and past experiment YAMLs — to find relevant context and similar entries for suggestions.
2. User history to personalize suggestions (e.g. frequently used cell lines or instruments by that user).
3. Lightweight RAG (Retrieval-Augmented Generation) from trusted external sources like PubMed for domain knowledge (e.g. auto-suggesting an antibody target or concentration from literature).
• GitHub Integration & Automation: Tight integration with GitHub features for collaboration and oversight. The task-runner will auto-create Issues, Draft PRs, and commits with descriptive messages as needed. This ensures the lab PI and team have full visibility of changes and can review via normal GitHub workflows. Automation is favored (with auto-commit/auto-PR) to streamline usage, but the system will request user clarification when confidence in an action is low or ambiguous.
• Persistent Tracking & Documentation: The repository will include Markdown documents to track tasks and status across AI sessions. This allows the AI (and users) to resume work with full context after interruptions. A human-readable “LabAgent Guide” will also be provided so any team member can understand how to use the system or pick up where someone left off.
By implementing these components, the lab management process becomes centralized, reproducible, and user-friendly, turning conversational instructions into documented lab actions with minimal friction.
System Architecture Overview
1. Codespace & Dev Container: The project will use GitHub Codespaces with a custom .devcontainer. The devcontainer includes all dependencies (Python, necessary libraries, vector DB, GitHub CLI, etc.) and configuration needed to run the agent and task-runner. This guarantees every users environment is identical and ready-to-go . When a user opens the repository in a Codespace, the container builds automatically, installing the AI agent backend and tools. No local installs are required “it works on my machine” problems are eliminated by containerizing the dev environment .
2. Cursor Agent (LLM): At the core is a chat-driven AI agent (powered by a language model such as GPT-4 with function calling support). The agent runs in the Codespace (either via a VSCode extension or a CLI interface) and accepts natural language instructions. A custom function schema is defined for lab management actions (e.g. create_protocol, start_experiment, log_result, open_issue, etc.), each with a JSON schema for arguments. The agent is instructed (via system prompt) to use these functions to perform tasks rather than free-form text. For example, if the user says “Create a new protocol for cell staining with antibody ABC,” the agent will output a JSON invoking create_protocol{name: ..., type: ..., fields: ...}. This structured output approach (with JSON mode strict schemas) ensures the models output can be parsed deterministically . The Cursor agent essentially plans the steps and expresses them in a machine-readable form.
3. Python Task-Runner: A persistent Python process (started on container launch) listens for the agents function call outputs (perhaps via a file or IPC mechanism). When the agent emits a JSON action, the task-runner parses it and executes the corresponding Python function. This could be implemented with a loop reading actions from the agents API or via the Cursor MCP (Model Context Protocol) plugin interface . Each function in the task-runner is carefully written to be idempotent and deterministic performing exactly the file system or Git operation specified. For instance, create_protocol will scaffold a new YAML file in Protocols/ with the provided content, and open_issue will call the GitHub API or CLI to create an issue. After execution, the task-runner can feed a result back to the agent (e.g. success or any output) to continue multi-step workflows. This design lets the AI chain multiple actions (tool use) reliably, akin to an agentic workflow where the LLM can “think then act” in cycles until the instruction is fulfilled .
4. Data Storage (YAML & Git): All lab information is stored as human-editable text (YAML or Markdown) in the repository. The monorepo layout organizes this data (detailed in the next section). By using text-based YAML, we ensure that adding or editing entries (protocols, experiments, etc.) is transparent and trackable via Git diff. YAML is chosen for its readability and structure it serves as a simple database for lab metadata. Notably, YAML protocols are already used in some lab automation contexts as a high-level experiment language , underscoring its suitability. The Git history provides an audit trail; every change by the agent is committed with a message, so one can trace what happened when, by whom (the agent will sign off with the users name or a bot identity). This versioning also aids reproducibility: if a protocol changes over time, one can retrieve exactly what protocol was used for an experiment by referencing commit IDs or tags.
5. Smart-Fill Recommendation Engine: Alongside the agent and runner, the architecture includes a metadata suggestion subsystem. This is composed of:
• An Embedding Index built from the repositorys contents. On Codespace startup (or on demand), a script scans files in Protocols/ and sample YAML entries from Experiments/ and Projects/, converting them into vector embeddings. This uses a pre-trained text embedding model (e.g. SentenceTransformers or OpenAI embeddings). The result is a lightweight vector database (possibly just an in-memory FAISS index or a local SQLite via ChromaDB) that can be queried for similarity.
• A History Tracker that maintains context about the current user and recent actions. For example, if user “Alice” has recently done several cell culture experiments, the system notes the cell lines or treatments she used frequently. This could be as simple as storing a JSON with per-user stats (e.g. last used protocol, commonly used reagent names) updated after each session.
• External Knowledge Fetcher: A minimal integration to query external databases like PubMed or protocols.io when needed. This will not be a persistent heavy service, but rather an API call using something like Biopythons Entrez or a requests call to NCBI E-utilities. For instance, if a user asks for a protocol that includes a specific antibody, and the agent doesnt find it in local files, it might query PubMed for that antibody to gather suggested usage or metadata (publication info, recommended dilutions, etc.). Only trusted sources (PubMed, DOI resolvers) are used to avoid misinformation.
All these components come together such that a user can simply describe what they want, and the system handles the plan → action → record → commit cycle automatically. Below, we detail the implementation plan for each part of this architecture.
Repository Structure and Scaffolding
We will organize the-jordan-lab/docs as a monorepo containing all relevant subfolders for protocols, experiments, data, etc. The following folders (and key files) will be created in the root of the repository:
• Protocols/ contains standard operating procedures and experimental protocols. Each protocol is a YAML (or Markdown + YAML front-matter) file describing a repeatable procedure. Convention: Use a short descriptive name for the file, plus a version or date if needed (e.g. Protocols/cell_staining_v1.yaml). These files include fields like name, description, steps, materials, version, author, etc. For example, a protocol YAML might look like:
name: Cell Staining Protocol
id: PROT-0001
description: Protocol for immunofluorescence staining of cells
author: Alice Smith
created: 2025-05-05
version: 1.0
materials:
- Antibody: Anti-XYZ (1:500 dilution)
- Stain: DAPI
- Buffer: PBS 1X
steps:
- "Fix cells with 4% PFA for 10 minutes."
- "Wash 3x with PBS."
- "Add primary antibody (Anti-XYZ) for 1 hour at RT."
- "Wash 3x with PBS."
- "Add DAPI stain for 5 minutes."
- "Wash and image."
notes: |
This protocol is derived from Doe et al. 2023.
Each protocol gets a unique ID or name. We will include a brief README.md in Protocols/ explaining how to add new protocols via the agent and the YAML schema expected (for human reference).
• Projects/ groups related experiments under broad project titles (e.g., a project might be “Tumor Growth Study 2025”). Each subfolder or YAML file in Projects/ outlines a projects goals, team, and links to relevant experiments. We may use one YAML per project (e.g. Projects/tumor_growth_2025.yaml) containing metadata: title, description, lead, team_members, associated_protocols, and a list of experiment IDs under it. This helps organize experiments logically (one project to many experiments relationship).
• Experiments/ records of individual experiments or lab sessions. Each experiment is a YAML file (or folder) that captures the plan, execution, and outcome of an experiment. Convention: We use a timestamp or incremental ID in the filename for uniqueness, possibly prefixed by project or user. For example, Experiments/2025-05-10_cell_staining_Alice.yaml or Experiments/EXP-0002.yaml. The YAML fields include:
• Reference to Protocol: e.g. protocol: cell_staining_v1 (which correlates with a file in Protocols/). If a protocol is modified for this experiment, the changes can be noted in a deviations: field.
• Parameters/Metadata: e.g. date, researcher, sample_id, reagents_used, instrument, settings, etc. The agents Smart-Fill will attempt to populate these. For instance, if protocol is known and has expected reagents, it can auto-fill the reagents_used list.
• Procedure Steps: Optionally, a list of steps (could be auto-copied from the protocol for completeness, then annotated with any changes).
• Results: free-form notes or links to data outputs (if small data, possibly included; if large, stored in Data/).
• Status: e.g. status: ongoing or completed or planned to track the state.
• Links: to project, or related experiments.
Example snippet for an experiment YAML:
experiment_id: EXP-0002
project: Tumor Growth Study 2025
title: Staining Tumor Cells with Anti-XYZ
date: 2025-05-10
researcher: Alice Smith
protocol: Cell Staining Protocol (v1.0)
materials:
Antibody: Anti-XYZ (lot #12345)
Cell line: HeLa
parameters:
Cell_count: 1e5
Incubation_time: 60 # minutes
results:
images: ["Data/Images/exp0002_image1.png", "Data/Images/exp0002_image2.png"]
observations: "Strong fluorescence observed in nucleus."
status: completed
The plan will include scaffolding a template experiment YAML in Templates/ (see below) that lists all required fields, which the agent can clone and fill for each new experiment to ensure completeness.
• Data/ storage for data outputs or references to data. Large raw data might reside outside Git (e.g. on cloud or a drive), but small data files or processed results can be saved here. We will organize subfolders by experiment or project, for example Data/Images/EXP-0002/ for images from experiment 0002, or Data/Sequencing/ProjectX/... etc. If data is external, the YAML records in Experiments can contain pointers (URLs or filesystem paths) to where the data is stored. A README.md in Data/ will clarify how to add data or link external data (the agent could automate adding placeholders or verifying links).
• Templates/ contains starter templates for various YAML structures (protocol, experiment, project). For instance:
• Templates/protocol_template.yaml with all fields blank or example values.
• Templates/experiment_template.yaml with required sections (and perhaps comments).
• Templates/project_template.yaml.
The Cursor agents task-runner will use these templates when scaffolding new files to ensure consistency. Deterministic conventions (like which keys to include and in what order) come from these templates, so all YAML files follow a standard format. This reduces variability and makes it easier to parse or validate entries later.
• Agent/ (or automation/ or similar) this folder will hold the code for the AI agent integration. E.g., a Python module agent_runner.py for the task-runner, any utilities for embedding or PubMed queries, and perhaps prompt templates for the LLM. Keeping this code in the repo means its versioned and can be improved via pull requests like any other code. This folder can also include the function definitions (possibly in JSON format or as Python descriptors) that define the interface between the LLM and the functions.
• Root files: In the repository root, well add:
• A detailed README.md explaining the repository purpose and structure. It will outline each directory and how they fit into lab workflows (essentially summarizing parts of this plan for end-users). It will emphasize that this is an electronic lab notebook / management system and how to get started with it in Codespaces.
• Documentation files for the agent system:
• LAB_AGENT_GUIDE.md (name tentative): Documentation for users on how to interact with the chat agent. For example, how to phrase requests, what the agent can do, and tips (like “you can ask the agent to show an experiment summary or to search protocols by keyword”). It will also describe the fallback behavior (when the agent might ask questions) so users know what to expect.
• TASKS.md: A Markdown task tracker (details in a later section) listing outstanding development tasks or lab to-dos. This might be used both for continuing the implementation of the system and for high-level lab tasks that the AI can help manage. The idea is to enable the AI (and humans) to see a to-do list and mark items done across sessions.
• CHANGELOG.md or STATUS_LOG.md: A log that the system (and users) update each session to summarize what was done. For example, each time the agent runs a major command, it appends “2025-05-10: Created experiment EXP-0002 (Cell Staining) via agent for Alice.” Keeping this log in markdown ensures that if the conversation context is lost, the next session can quickly reconstruct it by reading the recent log. It also provides the PI a quick way to see recent activity at a glance without combing through individual commits.
Scaffolding these folders and files will be the first step. We will create stub files (even if empty or with placeholder text) for templates and documentation so that everything is in place. With this deterministic structure, the AI agent always knows where to put or find things. For example, when asked to create a new protocol, it knows to place a YAML in Protocols/ and update any relevant index or list.
This monorepo approach centralizes all experimental knowledge. As Labguru advertises, centralizing experiments, protocols, and data in one hub improves collaboration and eliminates lost information . Our structure echoes that philosophy: experiments link to protocols and data, projects link to experiments, etc., all under one version-controlled roof.
Codespaces Environment and DevContainer Setup
To make the system fully operational inside GitHub Codespaces, we define a .devcontainer configuration in the repository. This includes at minimum a devcontainer.json and a Dockerfile or image specification that sets up:
• Base Image & OS: Use an image like mcr.microsoft.com/devcontainers/python:3.10 (for Python environment) or a lightweight Ubuntu with Python. The image should have git and basic tools.
• Python Environment: Install Python 3 and required pip packages. These likely include:
• OpenAI SDK (for calling the GPT API, if using OpenAIs service for the agent).
• LangChain or similar (optional, for structured output handling or vector store management).
• faiss-cpu or chromadb (for embeddings storage and similarity search).
• PyYAML (for reading/writing YAML files).
• GitPython or GitHubs gh CLI (to automate Git and GitHub actions if not using direct CLI).
• biopython (for PubMed Entrez API) or requests for external queries.
• Any Cursor-specific agent library if needed (if Cursor provides a Python package to interface with MCP or agent API).
• Possibly small utility libraries for text processing, etc.
• VS Code Extensions: The devcontainer can recommend/install extensions such as:
• GitHub Codespaces / Dev Containers extension (usually default).
• YAML extension for nice editing.
• Python extension for coding.
• If available, a Cursor extension or GitHub Copilot Chat anything to facilitate the chat interface in the VSCode environment. If Cursor has an extension for VSCode, include it (or instructions to connect to Cursor).
• Environment Variables: Well configure any needed environment vars. For example, OPENAI_API_KEY (which the user would supply via Codespaces secrets for security). Or a GITHUB_TOKEN (Codespaces provides a token by default for the repo, which can be used with gh CLI to auth to that repositorys scope). The devcontainer might include an .env or use the Codespaces secrets to ensure the agent can authenticate to required services (OpenAI, GitHub).
• Post-create Setup: Use the postCreateCommand to run setup tasks, such as:
• Index the repository content for embeddings (so the vector store is ready).
• Possibly launch the agent backend. For example, start the Python task-runner or MCP server. We might run a command like python agent/agent_runner.py --serve & to have it listening in the background.
• Run any migrations or checks (e.g., ensure the Templates folder has the latest schema, or verify YAMLs).
• Print a welcome message with next steps (maybe a reminder to open the chat interface).
Once configured, any contributor can open a Codespace and within minutes have a fully functional AI-assisted lab notebook environment. The reproducibility is key: “Dev Containers… ensure every developer uses the same environment, eliminating the works on my machine problem” . In our case, it ensures every student in the lab has the same tools and sees the same AI behavior.
Notably, no local installation is necessary. If someone prefers local development, they could use VS Code with the Remote - Containers extension to instantiate the same devcontainer locally. But the target is to use GitHub Codespaces in the cloud for ease. This means even on an iPad or a low-power laptop, a user can access the full system via a browser.
We also ensure that the Codespace has no external server dependencies: all logic runs inside (the LLM calls go out to OpenAI or are handled by Cursor's service, but we are not hosting our own server outside). The agent and task-runner run within the container. There's no need to deploy separate web services or databases — we rely on the GitHub platform (issues, PRs) for collaboration and lightweight local stores (YAML files, embeddings index) for data.
Cursor Agent & Function-Calling Task Orchestration
The Cursor agent is the AI assistant that interprets user instructions and decides which actions to perform. To implement this reliably, we will use OpenAIs function calling JSON protocol (or an equivalent) to constrain the agents output to a set of pre-defined actions . This ensures determinism and safety the agent cant execute arbitrary code or make changes unless its through one of our vetted functions.
Defining Functions (Actions): We enumerate the main actions the agent should handle in a JSON schema format to register with the LLM. For example:
• create_protocol(name: str, purpose: str, steps: list, materials: list, version: str): Creates a new protocol YAML in Protocols/. The agent will fill in fields like steps and materials if provided (or use template placeholders).
• create_experiment(project: str, protocol: str, date: str, researcher: str, parameters: dict): Creates a new experiment record in Experiments/. The agent should supply a unique ID or date for the experiment. Many of these arguments can come from conversation (or be guessed by Smart-Fill).
• update_experiment(id: str, results: str): Log results or update status of an experiment.
• suggest_metadata(field: str, context: str): A special function where the agent can call into the Smart-Fill system. This might trigger the Python side to do an embedding lookup or external search and return suggestions. (This could also be handled implicitly by the agents knowledge, but having a tool function allows deterministic retrieval from external sources.)
• open_issue(title: str, body: str): Creates a GitHub Issue for tracking. E.g., if a user says “Flag this for PI review,” the agent might call this to open an issue.
• open_pr(branch: str, title: str, body: str): Creates a Draft Pull Request. Typically used after significant changes (like adding multiple files for a new project) to request review. The task-runner can gather all uncommitted changes on a branch and push them, then open PR.
• commit_and_push(message: str): Commits current staged changes with the given message and pushes to GitHub. (The workflow could be commit after each atomic action or batch commit; we will likely commit at logical milestones to keep history readable.)
These function definitions (with their parameter schema and documentation strings) will be given to the LLM in the system message or via the Cursor MCP interface. This way, when the user instructs something that maps to an action, the model will choose to output a function call JSON.
Example Interaction:
User: “Im starting a new experiment tomorrow using the cell staining protocol on sample 123, please set it up.”
System/Agent (internally): The agent parses this and decides it needs to create a new experiment entry. It gathers details: protocol = “cell staining protocol”, date = tomorrow's date, researcher = (from user profile or ask), sample id = 123. It might also recall from embeddings that “cell staining protocol” expects an antibody and cell line, and use Smart-Fill to guess or ask. Finally, it responds not in natural language but with a function call, e.g.:
{
"function": "create_experiment",
"arguments": {
"project": "Unassigned",
"protocol": "Cell Staining Protocol (v1.0)",
"date": "2025-05-06",
"researcher": "Alice",
"parameters": {
"sample_id": "123",
"cell_line": "HeLa",
"antibody": "Anti-XYZ"
}
}
}
The Python task-runner receives this JSON and executes create_experiment.
Python Task-Runner Implementation: In agent/agent_runner.py, we will implement a loop (or use an asynchronous event system if integrated with Cursors MCP) to handle incoming function calls. For each function:
• Log the action to console or a log file (for debugging/audit).
• Perform the action:
• For file creation, use template files from Templates/, fill in the YAML fields from args, and write to the appropriate path. Mark the new file with a new experiment ID if not provided (the runner can generate the next ID by simple increment or timestamp).
• For updates, open the YAML, modify the necessary fields, and save.
• For suggestion retrieval (if suggest_metadata is called), call the embedding search or PubMed API accordingly (more details in Smart-Fill section) and return the results to the agent. The agent might then decide which suggestion to use and continue the function calls.
• For Git operations, use the GitHub CLI (gh) or a library. For example, open_issue can run gh issue create -t "title" -b "body", or use PyGithub if we prefer Python. Similarly, commit_and_push can run shell git commands or use a library to commit and push.
• Handle errors gracefully: if something fails (e.g. file already exists, or network error opening an issue), catch it and send a message back to the agent (the LLM) indicating failure. The agent can then relay to user or attempt a different approach. Ensuring error messages are succinct and useful (e.g. “Failed to create file, it may already exist”) will help if the AI or user needs to intervene.
After each significant action, the task-runner can optionally call back to the agent with a brief result. For example, after create_experiment, it could respond with a message like: “Experiment EXP-0002 created in Experiments/. Ready for additional details.” or supply the new experiment ID as return value. This can be done via the OpenAI function calling mechanism by returning a value. The agent might use that to inform its next message to the user.
The Cursor MCP route: If using Cursors Model Context Protocol, we would implement an MCP server in the container that Cursor (the editor) connects to. This MCP server would expose endpoints corresponding to our functions. The advantage of MCP is tighter integration (Cursor can call them as if the AI decided to), and it allows using Cursors UI features. Either approach (OpenAI API or MCP) results in similar behavior. Since the question references “Cursors function-calling JSON protocol,” it suggests we can use either OpenAIs API with JSON mode or Cursors own mechanism. We will plan for OpenAIs API usage for generality, but note that the devcontainer can support running Cursors own environment if needed.
Determinism and Schema Enforcement: We will use the strict: true setting for structured outputs if available . This means the model is required to produce exactly the JSON schema for function arguments. The benefit is 100% reliable parsing of outputs into our functions (no misformatted JSON) . By constraining the AI to these schemas, we essentially get a guarantee that the agents “thoughts” manifest as concrete, reproducible actions in our system, not just suggestions.
Multi-step Workflows: The users request may require multiple steps. The agent can call a sequence of functions in a single conversation turn or multiple turns. For instance, “Set up a new project for RNA extraction and create two experiments in it” might lead to:
1. create_project (makes a project YAML).
2. create_experiment (for experiment 1 under that project).
3. create_experiment (for experiment 2).
4. open_pr (open a draft PR with all these new files, maybe tagging the PI).
The agent will do this stepwise, possibly asking for confirmation or missing info in between. The task-runner will execute each in order. If any step requires more info (say the project description wasnt provided), the agent could either guess (Smart-Fill could provide a generic description) or ask the user for that detail before proceeding.
This agent-runner loop essentially forms an automated lab assistant. Its important that by default, it tries to fill in blanks automatically (using intelligent defaults or suggestions) to avoid pestering the user. Only when something is truly ambiguous or critical (confidence is low) will it pause to ask (see Automation vs Clarification below for strategy).
Security Considerations: All actions are local to the repo or via GitHub API with the users token, so theres minimal risk. We ensure the agent cannot execute arbitrary shell commands beyond our allowed functions. It also should not have internet access beyond what we explicitly code (like PubMed queries), which prevents it from doing unapproved external calls or data leak. (In a production setting, wed further sandbox this if needed.)
Smart-Fill Metadata Suggestion System
One highlight of this system is Smart-Fill, which reduces the manual effort in providing complete metadata for protocols and experiments. This system combines local knowledge and external references to suggest likely values for fields.
A. Vector Embeddings of Repo Content:
We will preprocess the content of our protocols and templates to create an embedding index. For example:
• For each protocol in Protocols/, compute an embedding of its text (including name, description, steps). Store these in an index keyed by protocol ID.
• For each existing experiment YAML, embed its contents or at least key fields (title, protocol used, materials, results summary).
• For each project, embed its description and scope.
• Possibly also embed any lab glossary or inventory if available (e.g., list of antibodies the lab commonly uses, list of cell lines, etc. this could simply be another YAML file we maintain).
Well use a model like OpenAIs text-embedding-ada-002 or a local alternative (to avoid external calls, maybe a locally hosted MiniLM or SBERT model). The embeddings (vectors) are stored in memory or a small database file. The suggest_metadata function in our runner can query this index with a question or partially known info.
Use Cases of Embeddings:
• Protocol Suggestion: If a user describes an experiment aim but doesnt specify a protocol, the agent can search the protocol embeddings for relevant ones. E.g., user says “I want to count cells after treatment” the agent finds a “Cell Counting” protocol in the index as a likely match.
• Parameter Guessing: If an experiment is created with a known protocol, the agent can look up similar experiments (via embeddings of experiment descriptions) to see what parameters were used. For instance, if doing “cell staining on HeLa cells,” and previously an experiment had cell_line: HeLa and used antibody X at 1:500, it might suggest the same antibody and dilution if context matches.
• Preventing omissions: The vector search can be used to ensure completeness. Suppose an experiment YAML is missing the antibody field but protocol suggests one the agent can notice from the protocol text that an antibody is needed and prompt or auto-fill it.
• Contextual answers: If the user asks a question like “What was the result of the last experiment using protocol Y?”, the agent can embed the query and find the relevant experiment record to answer.
Because these suggestions come from the labs own data, they are highly relevant and help maintain consistency. As one reference notes, “embeddings help LLMs generate more precise responses by retrieving contextually relevant information” . We are applying that by feeding the agent with relevant snippets when needed. The agent can incorporate retrieved data either directly into its function arguments or as additional context in the conversation.
B. Per-User Activity History:
We will maintain a simple log or profile for each user (perhaps identified by their GitHub username or a provided name). This could be a JSON in Agent/user_profiles.json with entries like:
{
"alice": {
"last_active": "2025-05-05",
"frequent_protocols": ["Cell Staining Protocol", "Flow Cytometry Protocol"],
"frequent_samples": ["HeLa", "Mouse fibroblast"],
"recent_experiments": ["EXP-0002", "EXP-0005"]
},
"bob": {
...
}
}
This data can be updated automatically: each time an experiment is created, add to that users recent experiments; each time they use a protocol, increment a counter. The agent can use this to tailor suggestions. For example, if Alice usually works with HeLa cells, the agent might default to cell_line: HeLa in a new experiment unless told otherwise. Or for Bob, maybe default to a different cell type.
This personalization makes the agent feel more “assistant-like” and further reduces repetition. It also helps disambiguation: if two possible protocols fit a request but one is the users favorite, pick that.
C. External RAG (e.g. PubMed):
For expanding beyond the labs internal knowledge, we integrate a minimal retrieval from external sources:
• PubMed queries: We can use NCBIs API to fetch article titles or abstracts related to a keyword. For instance, if a user mentions a gene or compound unknown to our system, the agent can do suggest_metadata("What is XYZ compound used for?") which our runner handles by querying PubMed for “XYZ compound protocol” or similar. The results (top 1-3) could be returned to the agent, which might glean that “Compound XYZ is often used as a staining agent in concentration 5 µM ” etc. The agent can then use that info to fill in details or cite sources.
• Protocols.io or other repositories: If internet access is allowed, the agent can search protocols.io via API (SciNote integration suggests many protocols are easily importable ). We wont focus on heavy integration due to time, but in future, the agent could fetch a template protocol from protocols.io if the lab doesnt have one internally.
• Safety and Trust: Only use well-known databases (PubMed, maybe ArXiv for methods) to avoid retrieving from random web sources. The assistant should cite or log any external info used, for transparency. Perhaps in the experiment notes it can add “Suggestion from PubMed: [citation]”.
Smart-Fill Workflow:
When the agent is about to call a function but lacks some info, it has options:
1. Check embeddings: e.g., it has protocol name and wants default parameters it queries similar experiments/protocols.
2. If embeddings yield clear result, fill the field automatically and proceed.
3. If still uncertain, attempt an external query if appropriate (e.g., unknown term).
4. If still uncertain or multiple possibilities, ask the user. For example: “I see two possible antibodies (Anti-ABC and Anti-XYZ) used in similar contexts. Which one are you using?” The agent would only reach this step if automation fails to give a confident answer.
Confidence can be gauged by embedding similarity score or by whether a field is critical and no data found. Well design simple thresholds (e.g., if cosine similarity > 0.8 for a suggestion, assume its good; if less, ask). This aligns with the requirement: prefer automation and auto-confirmation, with clarification only when confidence is low.
Metadata Completeness:
Upon creating or updating any YAML, the task-runner can include a validation step. Using a template or schema (possibly defined in Templates/ or via a JSON Schema in code), verify required fields are present (and not placeholders). If something is missing, the runner can prompt the agent for a follow-up. E.g., if after filling everything an experiment YAML still has cell_line: TBD or no results when marked completed, it can alert the agent or user. This ensures high-quality records. Essentially, the Smart-Fill tries to ensure that by the time an experiment is marked “completed,” all fields (like date, protocol, materials, results, etc.) are filled and meaningful. This emphasis on completeness and accuracy improves reproducibility future lab members can read the record and know exactly what was done .
As an example of Smart-Fill in action: A student says “I treated cells with DrugX for 24 hours.” The agent creates an experiment entry. The Smart-Fill might detect “DrugX” is not in our protocols, so it queries PubMed. It finds an article about DrugX usage. From that, it guesses the treatment concentration might be, say, 10 µM and the cell line used was MCF-7 (just hypothetical). It can fill treatment: DrugX 10µM for 24h and perhaps add a note “(suggested parameters from Doe et al. 2024)”. If the student or PI later sees this and wants to adjust, they can but the system provided a starting point automatically, which might otherwise require searching literature manually.
By combining internal data and external references, the lab agent becomes a proactive assistant, not just a passive recorder. It helps novices fill out forms correctly and reminds experienced users of details they might overlook.
GitHub Integration: Issues, PRs, and Commits
To integrate with the labs existing workflow and ensure PI visibility, we leverage GitHub features via automation:
Automated Git Commits: Every change made through the agent is committed to the repository. We will configure the task-runner to stage and commit files at appropriate intervals. Likely approaches:
• One commit per high-level command: e.g., the user says “Record experiment results,” which triggers updating two files the experiment YAML and maybe a summary in a project file. The runner can commit both together with message "Add results for EXP-0002 (Cell Staining experiment)".
• Auto commit after creation: When a new protocol or experiment file is created, commit it immediately with message like "Create new protocol: Cell Staining Protocol v1.0" or "Log experiment EXP-0003 (staining test)".
• Structured Commit Messages: We might adopt a consistent format for commit messages to make them scannable. For example, prefix with the type of action: Protocol: for protocol additions, Experiment: for experiment updates, etc. e.g., "Experiment: EXP-0002 created for cell staining assay". We can also allow the agent to draft the commit message since it knows the context; however, to keep it deterministic, the task-runner could assemble the message from known parameters (like using the files title or ID).
• The Cursor agent or the Cursor editor might have an AI commit message feature , but since we want determinism, well rely on our own controlled messaging.
Git Branching and Pull Requests:
By default, the system could commit to the default branch (e.g., main) for immediate record-keeping. However, for oversight, we might prefer changes to go to a branch and open a PR for review. Two modes are possible:
• Direct Commit Mode: Simpler tasks (adding an experiment log, small updates) commit directly to main with the assumption that the PI trusts these incremental notes (just as theyd trust a student writing in a paper notebook). Since everything is logged, any issue can be fixed via another commit.
• Pull Request Mode: Significant or potentially sensitive changes (like adding a new protocol or a large batch of edits) trigger a Draft PR. The task-runner will create a new branch (maybe named lab-agent/<feature> or <user>-<task>), push the changes, then open a PR with a summary. The PR description can be generated by the agent, listing what was done, and perhaps tagging @PI (the PIs GitHub handle) for review. Mark it as Draft if not ready for merge. The PI can then review the changes in a familiar interface, comment, request changes, or approve and merge. The agent could monitor the PR status and report back if merged or if changes are requested (though that may be beyond MVP).
We will integrate with GitHub using either:
• GitHub CLI (gh): which is straightforward inside Codespaces. E.g. gh issue create ... or gh pr create .... Well ensure gh is authenticated (Codespaces usually provides a token). This avoids handling tokens in code.
• PyGithub or GraphQL API: a Python library to create issues/PRs programmatically. This might be slightly more complex to implement but allows more fine-grained control within our Python runner (e.g., check if an issue exists, etc.). For our plan, gh CLI is sufficient and simpler.
Issue Creation:
Issues can be used for various purposes:
• Task tracking: If the agent encounters something it cant do automatically or needs human input later, it could open an issue as a reminder. For instance, “Experiment EXP-0005 lacks results awaiting data” could be an issue assigned to the student to fill in results later.
• PI notifications: The system might open an issue to notify about a new project or a completed experiment. The PI (if subscribed) gets an email. The issue body can contain a summary, and perhaps the PI can respond with feedback right there.
• Feature requests/bugs: On the development side, if the AI fails to parse something or an error occurs, it could log it as an issue for developers to fix the agent. This way improvement needs dont get lost.
Automatic Linking: We can have the agent include references in commit messages or issue bodies to tie things together. E.g., commit message “Experiment EXP-0002… (see #12)” to refer to issue #12 about that experiments review. Or in an issue describing a project, include links to the YAML files or PRs.
Mirroring with Gitea: The plan notes that Gitea is passively mirroring, so we dont need to do anything for Gitea specifically. We just push to GitHub; the mirror container will update Gitea. So effectively, all data is also available on the labs internal Gitea server for backup. We should ensure not to use any GitHub-specific feature that doesnt mirror well. However, issues and PRs wont mirror to Gitea (since its just a git mirror). The lab should be aware that the single source of truth for issues/PRs is GitHub (or at least the PI should check GitHub, not Gitea, for those). Well clarify that in documentation.
PI Visibility & Notifications:
Once these integrations are in place, the PI can simply watch the repository on GitHub to get notifications of all commits and issues. Additionally, by involving the PI in PRs or having them assigned to oversee certain issue labels (like “review needed”), we create a workflow where nothing significant happens without the PI seeing it. The PI can also browse the Markdown logs (CHANGELOG.md or the commit history) at any time to see a chronological list of what the lab has done recently, ensuring transparency. This addresses the need for PI visibility with minimal friction: the students dont have to separately email updates or fill out reports the system automatically produces those updates in the normal course of using it.
Persistent Task Tracking and Session Management
To facilitate working across multiple sessions (since AI context is not persistent unless stored) and enable resuming work seamlessly, we will implement Markdown-based trackers and logs.
Task Tracker (TASKS.md):
This file will list ongoing tasks or implementation steps, potentially in a checklist format. There can be two sections: Development Tasks (for building out this system itself) and Lab Tasks (for actual lab work to be done via the system). The AI agent can reference and update this file. For example:
## Lab Tasks
- [x] Set up project "Tumor Growth Study 2025" (created by lab agent on 2025-05-01)
- [ ] Run Experiment EXP-0007 (cell viability assay) **in progress** (assigned to Bob)
- [ ] Analyze results of EXP-0005 and generate report *pending data*
## Development Tasks
- [x] Scaffold repository structure (done in initial commit)
- [x] Implement create_experiment function
- [ ] Integrate PubMed metadata suggestions
- [ ] Write user guide documentation
The agent can mark items as done ([x]) when completed. For instance, after it finishes integrating PubMed suggestions, it would check that off (and perhaps add a line in a commit message referencing the task). This provides continuity if the agent session ends and restarts, it can load TASKS.md to see what remains. It also helps a human collaborator see progress at a glance without diving into commit logs.
Session Log (SESSION_LOG.md or CHANGELOG.md):
This will be appended with each session or major action. We might structure it by date:
# Lab Agent Activity Log
## 2025-05-05 Session (Alice)
- Created protocol "Cell Staining Protocol v1.0" via agent.
- Created experiment EXP-0002 using "Cell Staining Protocol" for sample 123.
- Auto-filled experiment metadata (antibody Anti-XYZ, cell line HeLa).
- Committed changes and opened PR #5 for PI review.
## 2025-05-06 Session (Alice)
- Added results to EXP-0002 (observations and images).
- Marked EXP-0002 as completed. Commit abcdef1.
- PI approved PR #5 and merged.
This log provides the full context needed to resume work at any time. If the same or another user comes back a week later, they can read the latest session entry to recall what the agent did and what is pending. The agent itself, on start, can be programmed to read the last N lines of this log and incorporate that into its system message (so it knows the recent history without needing the conversation memory). This is critical because the AI models conversational memory wont persist across sessions unless explicitly given.
We will have the agent update this log as part of its workflow. Possibly, after every high-level user command is done, append a bullet in the log file summarizing it. The task-runner can facilitate this (since its safer for the runner to write to files than trusting the AI to phrase it consistently).
Resuming Context:
When starting a new conversation with the agent (say the next day), the system can:
• Inject the content of TASKS.md and the last sessions log as part of the prompt (system or assistant message) to give the AI the context of whats happening.
• The user doesnt have to repeat where they left off; they can say “Lets continue with the viability assay” and the agent will understand from the log which experiment that is referring to and what the status is.
Documentation for Continuation:
We will document this mechanism in the user guide (LAB_AGENT_GUIDE.md). For example, instruct the user: “If you come back to the project after some time, read the Lab Agent Activity Log above to recall context. You can ask the agent whats the status of my experiments? it will summarize using the log and current data. The agent keeps this log updated so you dont have to.” This way, even if a different student takes over or assists, they can quickly get up to speed.
Finally, all these markdown files (TASKS.md, SESSION_LOG.md, etc.) are also visible on GitHub, meaning the PI or any collaborator can view them outside Codespaces too. This layered documentation ensures that even outside the AI interface, the projects state is well-communicated.
Automation vs. User Clarification Strategy
To meet the requirement of preferring automation with minimal user prompts, we design the agents behavior as follows:
• Auto-Execution by Default: For any well-understood instruction, the agent will proceed to carry it out fully without asking “Are you sure?” or “Should I do it?”. It will confirm by performing the action (and the user will see the result in the repository or via a brief summary message). For instance, “Log that I added 1 µL of reagent X” -> the agent finds the experiment YAML, updates it, commits “Update EXP-0003: added reagent X detail” and then tells the user “Noted: I added that detail to EXP-0003.” No extra confirmation needed because its a straightforward update.
• Implicit Confirmation: In cases where an action is reversible or minor (most git-tracked changes are reversible), the agent just does them. Users can always fix via another command if needed. This keeps the interaction flowing and avoids interrupting the user for permission frequently.
• When to Ask for Clarification: The agent will pause and ask the user only when:
• Its truly unsure how to proceed and the consequence of guessing wrong might be significant/confusing. For example, user says “schedule experiment next Monday” but there are two experiments that could be meant the agent might ask “Do you want to create a new experiment entry for next Monday, or schedule an existing one?”.
• A required piece of info is missing that Smart-Fill cannot confidently supply. E.g., user says “do X with antibody” but doesnt name the antibody, and multiple antibodies are possible. The agent might say: “Which antibody will you use? (e.g., Anti-ABC or Anti-XYZ)”.
• The users request is unusual or potentially dangerous (not likely in lab context, but if user asked to delete a project, the agent should confirm since thats destructive).
• Confidence Thresholds: The agents decision to auto-fill vs ask can be guided by confidence measures:
• If using OpenAI functions, the model itself might indicate uncertainty (“I think its X”). We can parse that. If not, we rely on our Smart-Fill scores. For example, if the top embedding match for a missing parameter has a high similarity and clearly fits, we auto-use it. If two matches are close or low similarity, we then ask.
• For numeric or scientific suggestions (like a concentration), if the agent finds conflicting values from sources, better to ask the user or at least present the suggestion as a question: “I assumed 10 µM as the concentration based on literature let me know if thats correct.”
• Auto-Confirmation of Actions: After an action, the agent does usually describe what it did (“Ive created the experiment entry with ID EXP-0007 and filled in the details.”). This serves as an implicit confirmation to the user that it interpreted the request correctly. The user can always say “Actually, change X…” if they notice something off. This design aligns with a helpful assistant that takes initiative yet remains responsive to corrections.
By minimizing explicit questions to the user, the workflow becomes efficient the student can rattle off a series of instructions and trust the agent to handle them. Only occasionally will the agent ping them for clarification. This reduces friction especially for routine tasks. Its akin to a real lab assistant who mostly knows what to do and only asks when absolutely necessary.
Of course, during initial deployment, well monitor if the agent maybe should ask more often in certain cases (to avoid assumptions). We can tune this by adjusting the agent prompt (for example, giving it guidelines on when to ask vs act).
Ensuring Reproducibility and Metadata Quality
Reproducibility is a top priority in lab work. Our system reinforces this in several ways:
• Comprehensive Metadata Capture: Every experiments YAML is structured to capture who, what, when, how, and results. By enforcing templates and using Smart-Fill to populate them, we ensure fields arent left blank. The agent will include as much detail as possible (including environmental conditions, instrument settings if mentioned, etc.). This addresses the concern that “details about experiments… are quickly forgotten unless they are written down” . The system diligently writes everything down in the notebook (YAML), so nothing relies on memory.
• Protocol Linking and Versioning: Experiments reference protocols by name and version. If a protocol is updated, a new version file can be created (and the old one kept). The experiment continues to point to the version it used. This way, years later one can see the exact procedure used. We could even have the agent automatically record the git commit hash of the protocol file at time of use (to absolutely pin the version). This might be overkill, but its an idea.
• Validation of Entries: The task-runner can include a validate function that runs after an experiment is marked completed to check that it has results and conclusion. Similarly for protocols: check that steps are not empty, etc. If somethings missing, tag the YAML or open an issue. E.g., if a student forgot to fill “conclusion” in an experiment, the system might open an issue “Please add conclusion for EXP-0007” or leave a TODO in the file. This ensures completeness before experiments are considered done.
• PI Review Workflow: By involving the PI via PRs or even periodic review of the logs, we introduce a human check. The PI might notice if something is odd (like an experiment missing a control) and can comment. The agent can then help the student address that (maybe via a new experiment for the control).
• Minimal Friction for Students: All the above is achieved with minimal extra work for students because the agent does the heavy lifting. The interface is just a chat. Students dont need to remember to fill every field if they forget, the agent either fills it or reminds them. The tedious parts of record-keeping (formatting, structuring, committing) are automated. This lowers the barrier to maintaining good records (one of the biggest challenges in research). The system essentially nudges users into good data practices by automating them.
• Reproducible Environment for Execution: If any code or analysis is part of experiments, the devcontainer ensures that running analysis scripts (if added to the repo) will yield the same results environment-wise. This goes beyond lab wet-work, but its worth noting for completeness: e.g., if an experiment includes an analysis Jupyter notebook, the container has the packages to run it, making even computational parts reproducible.
• Documentation for Users and PI: Well write a CONTRIBUTING.md or an onboarding doc for new students explaining this systems purpose: emphasize that its an electronic lab notebook and task manager, why writing everything (via the agent) is important, and how it benefits them (searchable history, easier report writing, etc.). Also a note to PIs on how to get their reports from it (maybe instruct on using GitHubs interface to filter by user or date, or to use the logs to compile results).
In summary, by combining structured data capture, automated suggestion, and integrated review, the system will greatly enhance the completeness and reliability of lab records. Students can focus on science rather than paperwork, while the PI can be confident that no key detail has been omitted from the records. As one system demonstrated, using structured YAML protocols can even drive automated lab equipment reliably ; in our case, it drives record-keeping and planning with the same rigor.
Documentation and Next Steps
Finally, we prepare documentation to ensure the system is maintainable and users can learn it quickly:
• User Guide (Lab Agent Guide): As mentioned, a Markdown guide explaining how to use the chat interface, with examples:
• e.g. “To create a protocol, just tell the agent e.g. Create a protocol for solution preparation. The agent will walk you through or auto-complete the details.”
• List of things the agent can do (create/edit/list/search).
• How to phrase questions vs commands.
• Troubleshooting: what to do if the agent seems stuck or makes a mistake (e.g., manually edit the file, or revert a commit, etc., and let the agent know).
• How the GitHub integration works (so theyre not surprised by auto commits or issues).
• Remind them to always push changes if they do anything manually.
• Developer Guide: Although the primary audience is lab users, we include some notes (maybe in the repository README or a separate DEV_NOTES.md) about the systems architecture for future maintainers. E.g., instructions to update the function schema if needed, or how to upgrade the embedding model, etc. Since this is a long-lived lab tool, eventually someone might need to tweak it (for example, if OpenAI API changes or if they switch to another LLM provider). Clear comments in code and a high-level doc will facilitate this.
• Resuming Work Documentation: In the README or Guide, explicitly mention that all progress is saved in the repository, and one can resume by reading TASKS.md and SESSION_LOG.md. Encourage committing these frequently (the agent will do so anyway). Essentially, “the system never forgets because it writes everything down”, so users should trust the logs more than their memory when resuming.
• GitHub Usage Documentation: A short section on how to use issues/PRs produced. For instance, if an issue is opened for them by the agent, they should know to close it once done or comment. If a PR is opened, they should know how to view the changes and merge if appropriate. Not all students might be familiar with GitHub PRs, so a brief intro could help (or link to GitHub docs).
• No External Servers: Document that the system runs fully in Codespaces and pushes to GitHub. If the labs Gitea is down or inaccessible, it doesnt affect using the agent (aside from mirror). And conversely, if Codespaces is down, one can still access the data on Gitea (but the agent wouldnt be running). This is more of an FYI for the PI about redundancy.
With all pieces in place repository structure, devcontainer, agent & runner, smart-fill, integration, and docs we will have a robust production-ready lab management system. It will have the following tangible outcomes:
• Folders and files scaffolded (protocols, experiments, etc., with templates).
• Working chat interface in Codespaces where the agent responds to lab commands.
• Example use case executed (perhaps in the README, illustrate creating a protocol and an experiment and show the resulting YAML and commit).
• Version control integration tested (ensuring commits and PRs happen correctly).
• Smart-Fill suggestions validated with a few test queries (maybe add a dummy protocol and see if it suggests it when querying).
Finally, after implementation, well likely do a dry run with a lab members actual experiment to fine-tune any issues. But the plan as detailed covers the blueprint to implement this step-by-step.
To conclude, this plan provides a comprehensive path to deploy the smart lab assistant in the-jordan-lab/docs. By capitalizing on modern LLM capabilities within a structured, containerized framework, we greatly streamline lab workflows while maintaining rigorous documentation standards. This meets the labs needs for completeness, reproducibility, and ease of use, transforming the GitHub repository into a living lab notebook maintained through natural conversation and intelligent automation.
Sources:
• Dev Containers for consistent Codespaces environments
• OpenAI function calling for structured, multi-step tool use
• YAML protocols as structured experiment scripts in automation
• Importance of embeddings in retrieval-augmented responses
• Labguru on centralizing experiments, protocols, and data for teamwork
• Need for detailed record-keeping in lab notebooks

403
Agent/lab.py Normal file
View File

@ -0,0 +1,403 @@
import os
import yaml
import shutil
import hashlib
import subprocess
from datetime import datetime
from typing import Dict, Any, List, Optional
import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("lab")
def create_experiment_with_plate(
    experiment_id: str,
    aim: str,
    project: str,
    researcher: str,
    plate: Dict[str, Any],
    sample_preparation: Optional[Dict[str, Any]] = None,
    downstream_application: Optional[Dict[str, Any]] = None,
    status: str = "in_progress"
) -> str:
    """
    Create a new experiment record with plate layout information.

    Writes Experiments/{experiment_id}.yaml and creates the matching
    Data/{experiment_id}/ directory for later data files.

    Args:
        experiment_id: Unique identifier for the experiment (EXP-YYYYMMDD format)
        aim: Brief description of the experiment's goal
        project: Project this experiment belongs to
        researcher: GitHub username or name of the researcher
        plate: Dictionary with plate ID and layout information
        sample_preparation: Optional dictionary with sample preparation details
        downstream_application: Optional dictionary with downstream application details
        status: Current status of the experiment (default: in_progress)

    Returns:
        Path to the created experiment file (relative, POSIX-style).
    """
    logger.info(f"Creating experiment {experiment_id} for {researcher}")
    # Core record; optional sections are appended below only when provided so
    # the YAML stays minimal.
    experiment = {
        "experiment_id": experiment_id,
        "aim": aim,
        "project": project,
        "researcher": researcher,
        "status": status,
        "created": datetime.now().strftime("%Y-%m-%d"),
        "plate": plate,
        "tasks": []
    }
    if sample_preparation:
        experiment["sample_preparation"] = sample_preparation
    if downstream_application:
        experiment["downstream_application"] = downstream_application
    # Create the per-experiment data directory up front so later record_data
    # calls (and manual file drops) have a destination.
    data_dir = os.path.join("Data", experiment_id)
    os.makedirs(data_dir, exist_ok=True)
    # Bug fix: the Experiments/ directory must also exist, otherwise open()
    # below raises FileNotFoundError on a fresh checkout.
    os.makedirs("Experiments", exist_ok=True)
    # sort_keys=False preserves the logical field order defined above.
    experiment_file = f"Experiments/{experiment_id}.yaml"
    with open(experiment_file, "w") as f:
        yaml.dump(experiment, f, sort_keys=False)
    logger.info(f"Experiment file created: {experiment_file}")
    return experiment_file
def add_experiment_task(experiment_id: str, task_title: str) -> Dict[str, Any]:
    """
    Add a task to an experiment and create a corresponding GitHub Issue.

    The issue is also attached to the "Lab Tasks Board" GitHub Project, and a
    task entry (id/title/status) is appended to the experiment's YAML.

    Args:
        experiment_id: ID of the experiment to add the task to
        task_title: Title of the task/Issue

    Returns:
        Dictionary with the created task entry ({"id", "title", "status"}).

    Raises:
        Exception: If the GitHub Issue could not be created.
    """
    logger.info(f"Adding task '{task_title}' to experiment {experiment_id}")
    # Security/correctness fix: pass an argument list with shell=False instead
    # of interpolating user-supplied text into a shell string. A title
    # containing quotes or shell metacharacters previously broke the command
    # (and allowed shell injection).
    issue_body = f"Experiment: {experiment_id}\n\nTask for experiment {experiment_id}"
    result = subprocess.run(
        ["gh", "issue", "create", "--title", task_title, "--body", issue_body],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        logger.error(f"Failed to create GitHub Issue: {result.stderr}")
        raise Exception(f"Failed to create GitHub Issue: {result.stderr}")
    # gh prints the new issue URL; the trailing path segment is the number.
    issue_url = result.stdout.strip()
    issue_number = issue_url.split("/")[-1]
    # Attach to the project board. Failure here is logged but non-fatal: the
    # issue and YAML entry are still valid without the board link.
    board_result = subprocess.run(
        ["gh", "issue", "edit", issue_number, "--add-project", "Lab Tasks Board"],
        capture_output=True,
        text=True,
    )
    if board_result.returncode != 0:
        logger.warning(
            f"Could not add issue #{issue_number} to project board: {board_result.stderr}"
        )
    # Mirror the issue into the experiment record so the repository stays the
    # single source of truth.
    experiment_file = f"Experiments/{experiment_id}.yaml"
    with open(experiment_file, "r") as f:
        experiment = yaml.safe_load(f)
    task = {
        "id": issue_number,
        "title": task_title,
        "status": "open"
    }
    if "tasks" not in experiment:
        experiment["tasks"] = []
    experiment["tasks"].append(task)
    with open(experiment_file, "w") as f:
        yaml.dump(experiment, f, sort_keys=False)
    logger.info(f"Added task {issue_number} to experiment {experiment_id}")
    return task
def record_data(exp_id: str, file_path: str, data_type: str) -> Dict[str, Any]:
    """
    Record a data file for an experiment. This function:
    1. Computes a SHA-256 checksum of the file
    2. Copies the file to Data/{exp_id}/ if not already there
    3. Adds an entry to the experiment's YAML data section
    4. Commits the data file and updated YAML

    Args:
        exp_id: Experiment ID
        file_path: Path to the data file
        data_type: Type of data (e.g., qPCR, flow_cytometry, imaging)

    Returns:
        The data entry added to the experiment.

    Raises:
        FileNotFoundError: If the experiment YAML does not exist.
    """
    logger.info(f"Recording {data_type} data for experiment {exp_id}")
    # Verify the experiment exists before touching the filesystem further.
    experiment_file = f"Experiments/{exp_id}.yaml"
    if not os.path.exists(experiment_file):
        raise FileNotFoundError(f"Experiment {exp_id} not found")
    data_dir = os.path.join("Data", exp_id)
    os.makedirs(data_dir, exist_ok=True)
    # Checksum the raw bytes so the YAML entry can later verify integrity.
    with open(file_path, "rb") as f:
        sha256 = hashlib.sha256(f.read()).hexdigest()
    filename = os.path.basename(file_path)
    target_path = os.path.join(data_dir, filename)
    # Copy only when the source isn't already the canonical location
    # (copy2 preserves timestamps/metadata).
    if os.path.abspath(file_path) != os.path.abspath(target_path):
        shutil.copy2(file_path, target_path)
        logger.info(f"Copied data file to {target_path}")
    with open(experiment_file, "r") as f:
        experiment = yaml.safe_load(f)
    # Bug fix: the recorded path previously contained the literal placeholder
    # "(unknown)" instead of the actual filename, so data entries never
    # pointed at the real file.
    data_entry = {
        "path": f"Data/{exp_id}/{filename}",
        "type": data_type,
        "sha256": sha256,
        "added": datetime.now().strftime("%Y-%m-%d")
    }
    if "data" not in experiment:
        experiment["data"] = []
    experiment["data"].append(data_entry)
    with open(experiment_file, "w") as f:
        yaml.dump(experiment, f, sort_keys=False)
    # Commit via argument lists (no shell string) so paths with spaces or
    # quotes cannot break the command. Only commit if staging succeeded,
    # matching the original `git add ... && git commit ...` semantics.
    add_result = subprocess.run(["git", "add", target_path, experiment_file])
    if add_result.returncode == 0:
        subprocess.run(
            ["git", "commit", "-m", f"Data: add {data_type} result for {exp_id}"]
        )
    logger.info(f"Recorded data file {filename} for experiment {exp_id}")
    return data_entry
def close_task(issue_number: int) -> None:
    """
    Close a GitHub Issue and update the corresponding experiment task status.

    Scans every experiment YAML under ``Experiments/`` for a task whose id
    matches the issue number, flips its status to "closed", commits the
    change, and stops at the first match.

    Args:
        issue_number: The GitHub Issue number to close.
    """
    logger.info(f"Closing task (issue #{issue_number})")
    # List-form subprocess (shell=False): the issue number is passed as a
    # plain argument rather than interpolated into a shell string.
    result = subprocess.run(["gh", "issue", "close", str(issue_number)])
    if result.returncode != 0:
        logger.error(f"Failed to close GitHub Issue #{issue_number}")
        return
    # Find which experiment this issue belongs to.
    experiments_dir = "Experiments"
    for filename in os.listdir(experiments_dir):
        if not filename.endswith(".yaml"):
            continue
        file_path = os.path.join(experiments_dir, filename)
        with open(file_path, "r") as f:
            experiment = yaml.safe_load(f)
        # Empty YAML files parse to None; guard before membership test.
        if not experiment or "tasks" not in experiment:
            continue
        for i, task in enumerate(experiment["tasks"]):
            # Compare as strings: YAML may store the id as int or str.
            if str(task.get("id")) == str(issue_number):
                experiment["tasks"][i]["status"] = "closed"
                with open(file_path, "w") as f:
                    yaml.dump(experiment, f, sort_keys=False)
                # .get() fallback: older files may lack experiment_id.
                exp_id = experiment.get("experiment_id", filename)
                logger.info(f"Updated task status in experiment {exp_id}")
                subprocess.run(["git", "add", file_path])
                subprocess.run(
                    ["git", "commit", "-m",
                     f"Task: close issue #{issue_number} for {exp_id}"]
                )
                return
    logger.warning(f"Could not find task {issue_number} in any experiment")
def finish_experiment(exp_id: str) -> bool:
    """
    Finish an experiment.

    1. Verifies all tasks are closed.
    2. Checks that at least one data entry exists.
    3. Sets status to "completed" and stamps the completion date.
    4. Appends a summary to SESSION_LOG.md and commits both files.

    Args:
        exp_id: Experiment ID to finish.

    Returns:
        True if the experiment was successfully completed, False otherwise.
    """
    logger.info(f"Attempting to finish experiment {exp_id}")
    experiment_file = f"Experiments/{exp_id}.yaml"
    if not os.path.exists(experiment_file):
        logger.error(f"Experiment {exp_id} not found")
        return False
    with open(experiment_file, "r") as f:
        experiment = yaml.safe_load(f)
    # Refuse to finish while any task is still open.
    if "tasks" in experiment:
        open_tasks = [t for t in experiment["tasks"] if t.get("status") != "closed"]
        if open_tasks:
            task_ids = [t.get("id") for t in open_tasks]
            logger.warning(f"Cannot finish experiment: open tasks remain: {task_ids}")
            return False
    # Require at least one recorded data entry.
    if not experiment.get("data"):
        logger.warning("Cannot finish experiment: no data recorded")
        return False
    experiment["status"] = "completed"
    experiment["completed"] = datetime.now().strftime("%Y-%m-%d")
    with open(experiment_file, "w") as f:
        yaml.dump(experiment, f, sort_keys=False)
    # Create the session log on first use.
    session_log_path = "SESSION_LOG.md"
    if not os.path.exists(session_log_path):
        with open(session_log_path, "w") as f:
            f.write("# Lab Session Log\n\n")
    # BUG FIX: .get() fallbacks — direct indexing raised KeyError for
    # experiment files that lack 'researcher' or 'aim' (several YAMLs in
    # this repo use 'title' instead of 'aim').
    researcher = experiment.get("researcher", "unknown")
    aim = experiment.get("aim", experiment.get("title", ""))
    with open(session_log_path, "a") as f:
        f.write(f"\n## {datetime.now().strftime('%Y-%m-%d')} - {researcher}\n")
        f.write(f"- Completed experiment {exp_id}: {aim}\n")
        f.write(f"- Data files: {len(experiment['data'])}\n")
    # List-form subprocess avoids shell quoting issues in paths/messages.
    subprocess.run(["git", "add", experiment_file, session_log_path])
    subprocess.run(["git", "commit", "-m", f"Experiment: completed {exp_id}"])
    logger.info(f"Successfully completed experiment {exp_id}")
    return True
def list_experiments(status_filter=None):
    """
    List all experiments, optionally filtered by status.

    Args:
        status_filter: Optional status to match ('in_progress', 'completed',
            etc.). None returns every experiment.

    Returns:
        List of experiment dicts parsed from Experiments/*.yaml.
    """
    experiments = []
    experiments_dir = "Experiments"
    if not os.path.exists(experiments_dir):
        return []
    for filename in os.listdir(experiments_dir):
        if not filename.endswith(".yaml"):
            continue
        file_path = os.path.join(experiments_dir, filename)
        with open(file_path, "r") as f:
            experiment = yaml.safe_load(f)
        # BUG FIX: an empty YAML file parses to None (and a malformed one may
        # parse to a non-dict); skip those instead of crashing on .get().
        if not isinstance(experiment, dict):
            continue
        if status_filter is None or experiment.get("status") == status_filter:
            experiments.append(experiment)
    return experiments
def record_cli():
    """Entry point for the ``lab-record`` CLI command.

    Parses --exp/--file/--type and delegates to record_data(). Exits with
    status 1 on any failure so shell scripts can detect the error.
    """
    import argparse
    parser = argparse.ArgumentParser(description="Record data for an experiment")
    parser.add_argument("--exp", required=True, help="Experiment ID")
    parser.add_argument("--file", required=True, help="Path to data file")
    parser.add_argument("--type", required=True, help="Type of data")
    args = parser.parse_args()
    try:
        result = record_data(args.exp, args.file, args.type)
        print(f"Data recorded successfully: {result['path']}")
    except Exception as e:
        print(f"Error: {e}")
        # BUG FIX: raise SystemExit instead of the ``exit`` builtin — ``exit``
        # is injected by the site module for interactive use and may be
        # absent when Python runs with -S or in frozen environments.
        raise SystemExit(1)
def main():
    """Entry point for lab CLI command"""
    import argparse

    parser = argparse.ArgumentParser(description="Lab Management CLI")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # "record": attach a data file to an experiment.
    p_record = subparsers.add_parser("record", help="Record data for an experiment")
    p_record.add_argument("--exp", required=True, help="Experiment ID")
    p_record.add_argument("--file", required=True, help="Path to data file")
    p_record.add_argument("--type", required=True, help="Type of data")

    # "close-task": close a GitHub issue and sync the YAML task status.
    p_close = subparsers.add_parser("close-task", help="Close a task/issue")
    p_close.add_argument("--issue", type=int, required=True, help="Issue number to close")

    # "finish": mark an experiment as completed.
    p_finish = subparsers.add_parser("finish", help="Mark an experiment as completed")
    p_finish.add_argument("--exp", required=True, help="Experiment ID to finish")

    # "list": print experiments, optionally filtered by status.
    p_list = subparsers.add_parser("list", help="List experiments")
    p_list.add_argument("--status", help="Filter by status")

    # "init-extensions": documented here; the real implementation lives in
    # init_extensions.py.
    subparsers.add_parser("init-extensions", help="Initialize extensions and environment")

    args = parser.parse_args()
    command = args.command
    if command == "record":
        record_data(args.exp, args.file, args.type)
    elif command == "close-task":
        close_task(args.issue)
    elif command == "finish":
        finish_experiment(args.exp)
    elif command == "list":
        for exp in list_experiments(args.status):
            exp_id = exp.get('experiment_id', 'No ID')
            aim = exp.get('aim', 'No description')
            status = exp.get('status', 'unknown')
            print(f"{exp_id} - {aim} - {status}")
    elif command == "init-extensions":
        print("Please run 'lab-init-extensions' instead")
    else:
        parser.print_help()


# Command-line interface
if __name__ == "__main__":
    main()

21
Agent/setup.py Normal file
View File

@ -0,0 +1,21 @@
from setuptools import setup, find_packages

# Packaging metadata for the lab-management toolkit. Installing this package
# exposes three console commands (see entry_points below).
setup(
    name="lab",
    version="0.1",
    packages=find_packages(),
    # Runtime dependencies: YAML experiment files (pyyaml), vector store
    # (chromadb), sequence utilities (biopython), and git automation.
    install_requires=[
        "pyyaml",
        "chromadb",
        "biopython",
        "gitpython",
    ],
    entry_points={
        "console_scripts": [
            # lab                 -> full CLI (record/close-task/finish/list)
            "lab=Agent.lab:main",
            # lab-record          -> shortcut for recording data files
            "lab-record=Agent.lab:record_cli",
            # lab-init-extensions -> environment bootstrap
            "lab-init-extensions=Agent.init_extensions:main",
        ],
    },
    python_requires=">=3.8",
)

View File

@ -1,42 +0,0 @@
experiment_id: EXP-0225
project: Post-transcriptional regulation by Ybx1
subproject: mRNA stability measurement after Ybx1 knockdown
title: Initial Ybx1 knockdown effect on mRNA stability (reverse transfection)
date: 2025-05-06
researcher: Jack Zhao
protocol: Ybx1 knockdown mRNA stability assay v1.0
materials:
siRNA_Ybx1: Dharmacon ON-TARGETplus Human YBX1 siRNA SMARTpool (10 nM)
siRNA_Control: Dharmacon ON-TARGETplus Non-targeting Control Pool (10 nM)
Lipofectamine: RNAiMAX (1.5 µL per well)
Opti-MEM: 100 µL per well (for complexing)
Actinomycin_D: 5 µg/mL (Sigma)
Cell_line: HEK293T cells (3 × 10^5 cells per well)
parameters:
transfection_method: Reverse transfection
plate_type: 24-well
total_wells: 24 (4 timepoints x 2 conditions x 3 replicates)
qPCR_targets: Ybx1, Myc, p53, GAPDH (control), 18S rRNA (control)
actinomycin_D_time_points: 0, 2, 4, 8 hours
expected_duration: 4 days (May 6-9, 2025)
results:
observations: "TBD"
data_location: "Data/RNA_stability/EXP-0225/"
status: planned
notes: |
Timeline:
- Day 1 (May 6, 2025): Reverse transfection setup
- Day 3 (May 8, 2025): Confirm knockdown by qPCR
- Day 3 (May 8, 2025): Add actinomycin D and collect timepoints
- Day 4 (May 9, 2025): RNA extraction, cDNA synthesis
- RNA collection and qPCR analysis for days 3-4 by Jack Zhao
Special considerations:
- We will use reverse transfection to improve efficiency and reduce handling steps
- Actinomycin D is toxic; use proper PPE and dispose of waste in designated containers
- All timepoints after actinomycin D addition should be processed rapidly to minimize RNA degradation
- Plan to freeze samples at -80°C if all timepoints cannot be processed on the same day
Next steps:
- May 10, 2025: Data analysis and calculation of mRNA half-lives
- Based on results, plan follow-up experiments with expanded gene set or modified conditions

View File

@ -1,16 +0,0 @@
experiment_id: EXP-0002
project: Tumor Growth Study 2025
title: Staining Tumor Cells with Anti-XYZ
date: 2025-05-10
researcher: Alice Smith
protocol: Cell Staining Protocol (v1.0)
materials:
Antibody: Anti-XYZ (lot #12345)
Cell line: HeLa
parameters:
Cell_count: 1e5
Incubation_time: 60 # minutes
results:
images: ["Data/Images/exp0002_image1.png", "Data/Images/exp0002_image2.png"]
observations: "Strong fluorescence observed in nucleus."
status: completed

View File

@ -1,29 +0,0 @@
experiment_id: EXP-0100
project: RNAi Knockdown Screen
title: 24-well Plate siRNA Transfection (4 siRNAs, RNAiMAX)
date: 2024-06-08
researcher: Lab Agent
protocol: 24-well Plate siRNA Transfection (RNAiMAX) v1.0
materials:
siRNA#1: 10 nM final
siRNA#2: 10 nM final
siRNA#3: 10 nM final
siNC: 10 nM final (negative control)
Lipofectamine RNAiMAX: 1.5 µL/well
Opti-MEM: 100 µL/well (for complexing)
Cells: 5x10^4/well in 24-well plate
parameters:
plate_type: 24-well
total_wells: 24
siRNA_per_well: 10 pmol
rnaimax_per_well: 1.5 µL
opti_mem_per_well: 100 µL
incubation_time: 24-72 hours
results:
images: []
observations: "TBD"
status: planned
notes: |
- siNC is a non-targeting negative control siRNA.
- Each siRNA transfected in separate wells, following standard RNAiMAX protocol.
- See protocol PROT-0020 for detailed steps and solution preparation.

View File

@ -1,21 +0,0 @@
# Experiments Folder
This folder contains records of individual experiments or lab sessions as YAML files.
## How to Add a New Experiment
- Use the experiment_template.yaml in Templates/ as a starting point.
- Name your file with a unique ID or date, e.g., `2025-05-10_cell_staining_Alice.yaml`.
- Fill in all required fields: experiment_id, project, title, date, researcher, protocol, materials, parameters, results, status.
- Submit via the lab agent or manually, then commit to the repository.
## YAML Schema Reference
See `Templates/experiment_template.yaml` for the required structure.
## Example Experiment
See `2025-05-10_cell_staining_Alice.yaml` in this folder for a complete example of an experiment file. Use it as a reference when creating new experiments.
### Example Usage
To add a new experiment, you can:
1. Use the lab agent and describe your experiment in natural language (e.g., "Log a cell staining experiment for Alice on May 10, 2025").
2. The agent will generate a YAML file similar to `2025-05-10_cell_staining_Alice.yaml`.
3. Review and edit as needed, then commit the file.

View File

@ -1,12 +0,0 @@
# Projects Folder
This folder contains project records, grouping related experiments under broad project titles.
## How to Add a New Project
- Use the project_template.yaml in Templates/ as a starting point.
- Name your file descriptively, e.g., `tumor_growth_2025.yaml`.
- Fill in all required fields: title, description, date_started, lead, team_members, associated_protocols, experiments, notes.
- Submit via the lab agent or manually, then commit to the repository.
## YAML Schema Reference
See `Templates/project_template.yaml` for the required structure.

View File

@ -1,21 +0,0 @@
# Experiments Folder
This folder contains records of individual experiments or lab sessions as YAML files.
## How to Add a New Experiment
- Use the experiment_template.yaml in Templates/ as a starting point.
- Name your file with a unique ID or date, e.g., `2025-05-10_cell_staining_Alice.yaml`.
- Fill in all required fields: experiment_id, project, title, date, researcher, protocol, materials, parameters, results, status.
- Submit via the lab agent or manually, then commit to the repository.
## YAML Schema Reference
See `Templates/experiment_template.yaml` for the required structure.
## Example Experiment
See `2025-05-10_cell_staining_Alice.yaml` in this folder for a complete example of an experiment file. Use it as a reference when creating new experiments.
### Example Usage
To add a new experiment, you can:
1. Use the lab agent and describe your experiment in natural language (e.g., "Log a cell staining experiment for Alice on May 10, 2025").
2. The agent will generate a YAML file similar to `2025-05-10_cell_staining_Alice.yaml`.
3. Review and edit as needed, then commit the file.

View File

@ -1,26 +0,0 @@
title: mRNA stability measurement after Ybx1 knockdown
description: Investigation of mRNA stability changes in selected genes after Ybx1 knockdown, using siRNA and actinomycin D to assess post-transcriptional regulation
date_started: 2025-05-06
lead: Dr. Jim Jordan
team_members:
- Dr. Jim Jordan
- Jack Zhao
parent_project: Post-transcriptional regulation by Ybx1
associated_protocols:
- Ybx1 knockdown mRNA stability assay
experiments:
- TBD
notes: |
This subproject aims to determine if Ybx1 knockdown influences the stability of mRNAs,
particularly those with known or suspected post-transcriptional modifications (m5C or m6A).
Specific aims:
1. Establish efficient Ybx1 knockdown using siRNA
2. Measure half-lives of selected target mRNAs in control vs. Ybx1 knockdown conditions
3. Identify genes most affected by Ybx1 depletion
4. Correlate mRNA stability changes with RNA modifications if possible
Initial target genes include Myc, p53, and other transcripts with known post-transcriptional regulation.
This experiment will employ actinomycin D to block transcription, followed by time-course
sampling and RT-qPCR to measure the decay rates of specific mRNAs.

View File

@ -1,17 +0,0 @@
title: Post-transcriptional regulation by Ybx1
description: Project to understand how Ybx1 influences post-transcriptional regulation of gene expression, with emphasis on mRNA stability
date_started: 2025-05-06
lead: Dr. Jim Jordan
team_members:
- Dr. Jim Jordan
- Jack Zhao
associated_protocols:
- siRNA transfection protocol
- Actinomycin D mRNA stability protocol
experiments:
- TBD
notes: |
This project will investigate the role of Ybx1 in regulating mRNA stability,
with particular focus on genes involved in cellular processes. Ybx1 has been
shown to interact with m5C and m6A modifications on mRNAs, suggesting a role
in post-transcriptional gene regulation.

View File

@ -0,0 +1,187 @@
# EXP-0225 mRNA Stability Analysis Script
# Analysis of YBX1 knockdown effect on mRNA half-life in Huh7 and HepG2 cells
# Authors: james-m-jordan, jack-zhao
# Date: 2025-05-12

# Load required libraries
library(tidyverse)
library(readxl)       # BUG FIX: read_excel() below comes from readxl, which was never loaded
library(rtracklayer)
library(ggplot2)
library(cowplot)
library(DESeq2)

# Set paths
experiment_id <- "EXP-0225"
data_dir <- file.path("Data", experiment_id, "raw")
output_dir <- file.path("Data", experiment_id, "figures")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

# Load qPCR data
# Assuming format: sample, gene, timepoint, Ct, cell_line, treatment
qpcr_data <- read_excel(file.path(data_dir, "timecourse_qPCR.xlsx"))

# Load RNA concentration data
rna_data <- read_excel(file.path(data_dir, "RNA_concentrations.xlsx"))

# Define reference genes
ref_genes <- c("GAPDH", "ACTB")
target_genes <- c("IL6", "MYC")
# Normalize each Ct value to the mean Ct of the reference genes measured in
# the same sample/timepoint/cell line/treatment group (delta Ct).
calculate_delta_ct <- function(qpcr_df) {
  # Mean reference-gene Ct per sample group
  reference_means <- qpcr_df %>%
    filter(gene %in% ref_genes) %>%
    group_by(sample, timepoint, cell_line, treatment) %>%
    summarize(ref_ct = mean(Ct), .groups = "drop")

  # Attach the reference mean to every row and compute the difference
  qpcr_df %>%
    left_join(reference_means, by = c("sample", "timepoint", "cell_line", "treatment")) %>%
    mutate(delta_ct = Ct - ref_ct)
}
# Express each gene relative to its own t = 0 baseline (delta-delta Ct) and
# convert to linear relative expression (2^-ddCt) plus its natural log.
calculate_relative_expression <- function(qpcr_df) {
  normalized <- calculate_delta_ct(qpcr_df)

  # Baseline delta Ct at the 0h timepoint, per gene/cell line/treatment
  baselines <- normalized %>%
    filter(timepoint == "0h") %>%
    select(gene, cell_line, treatment, delta_ct) %>%
    rename(delta_ct_0 = delta_ct)

  # Delta-delta Ct against the baseline, then relative expression
  normalized %>%
    left_join(baselines, by = c("gene", "cell_line", "treatment")) %>%
    mutate(
      delta_delta_ct = delta_ct - delta_ct_0,
      rel_expr = 2^(-delta_delta_ct),
      ln_rel_expr = log(rel_expr)
    )
}
# Function to calculate mRNA half-life
# Fits ln(relative expression) ~ time within each gene/cell line/treatment
# group and derives the half-life t1/2 = ln(2)/|k| from the regression
# slope k (first-order exponential decay model).
calculate_half_life <- function(expr_data) {
  # Convert timepoint to numeric hours
  expr_data <- expr_data %>%
    mutate(
      hours = case_when(
        timepoint == "0h" ~ 0,
        timepoint == "1h" ~ 1,
        timepoint == "2h" ~ 2,
        timepoint == "4h" ~ 4,
        timepoint == "8h" ~ 8,
        TRUE ~ NA_real_  # unrecognized labels become NA and are dropped below
      )
    )
  # Calculate half-life for each gene, cell line, and treatment
  expr_data %>%
    filter(!is.na(hours)) %>%
    group_by(gene, cell_line, treatment) %>%
    do({
      # Fit linear model: ln(expression) ~ time
      model <- lm(ln_rel_expr ~ hours, data = .)
      # Extract slope (k)
      k <- coef(model)[2]
      # Calculate half-life: t1/2 = ln(2)/|k|
      t_half <- log(2)/abs(k)
      # Return results (one row per group)
      tibble(
        slope = k,
        half_life = t_half,
        r_squared = summary(model)$r.squared,
        p_value = summary(model)$coefficients[2,4]
      )
    })
}
# PLACEHOLDER: Data processing steps (to be filled with actual data)
# 1. Read and process data
# normalized_data <- calculate_relative_expression(qpcr_data)
# 2. Calculate half-lives
# half_lives <- calculate_half_life(normalized_data)
# 3. Compare half-lives: control vs YBX1 knockdown
# half_life_comparison <- half_lives %>%
# select(gene, cell_line, treatment, half_life) %>%
# pivot_wider(
# names_from = treatment,
# values_from = half_life,
# names_prefix = "t_half_"
# ) %>%
# mutate(
# ratio = t_half_siYBX1 / t_half_siCTRL,
# percent_change = (ratio - 1) * 100
# )
# PLACEHOLDER: Plot generation (to be filled with actual data)
# Plot mRNA decay curves for the target genes in a given cell line:
# ln(relative expression) over time, faceted by gene, colored by treatment,
# with a per-group linear fit visualizing the decay rate.
plot_decay_curves <- function(data, cell_line_to_plot) {
  # Restrict to the requested cell line and the genes of interest
  decay_df <- filter(data, cell_line == cell_line_to_plot, gene %in% target_genes)

  # Build the plot layer by layer (same layers/order as a single + chain)
  p <- ggplot(decay_df, aes(x = hours, y = ln_rel_expr, color = treatment, shape = treatment))
  p <- p + geom_point(size = 3)
  p <- p + geom_smooth(method = "lm", se = TRUE, alpha = 0.2)
  p <- p + facet_wrap(~gene, scales = "free_y")
  p <- p + labs(
    title = paste("mRNA Decay Curves in", cell_line_to_plot, "Cells"),
    x = "Time after Actinomycin D (hours)",
    y = "ln(Relative Expression)",
    color = "Treatment",
    shape = "Treatment"
  )
  p <- p + scale_color_manual(values = c("siCTRL" = "blue", "siYBX1" = "red"))
  p <- p + theme_cowplot()
  p + theme(legend.position = "bottom")
}
# PLACEHOLDER: Save results
# Persist half-life results: a long-format summary CSV plus a wide comparison
# table (one column per cell line x treatment combination) for the manuscript.
save_results <- function(half_life_df) {
  # Long-format summary
  write_csv(half_life_df, file.path(output_dir, "half_life_summary.csv"))

  # Wide comparison table keyed by gene
  wide_table <- half_life_df %>%
    select(gene, cell_line, treatment, half_life) %>%
    pivot_wider(
      names_from = c(cell_line, treatment),
      values_from = half_life
    )
  write_csv(wide_table, file.path(output_dir, "half_life_comparison_table.csv"))
}
# PLACEHOLDER: Main execution (commented out until real data is available)
# Process and analyze data
# normalized_data <- calculate_relative_expression(qpcr_data)
# half_lives <- calculate_half_life(normalized_data)
# Generate and save plots
# huh7_decay_plot <- plot_decay_curves(normalized_data, "Huh7")
# ggsave(file.path(output_dir, "Huh7_decay_curves.pdf"), huh7_decay_plot, width = 10, height = 8)
#
# hepg2_decay_plot <- plot_decay_curves(normalized_data, "HepG2")
# ggsave(file.path(output_dir, "HepG2_decay_curves.pdf"), hepg2_decay_plot, width = 10, height = 8)
# Save numerical results
# save_results(half_lives)
# Final output
# Console banner only; half-life numbers are printed once real data exists
# and the commented-out analysis steps above are enabled.
cat("
========================================
EXP-0225 mRNA Stability Analysis Results
========================================
# To be updated with actual results after data collection
Analysis completed: 2025-05-12
Results saved to:", output_dir, "\n")

View File

@ -0,0 +1,217 @@
# EXP-0226 Co-IP Western Blot Quantification Script
# Analysis of YBX1-CEBPA protein interaction in early adipogenesis
# Authors: james-m-jordan, linda-onsei
# Date: 2025-05-11

# Load required libraries
library(tidyverse)
library(readxl)
library(ggplot2)
library(cowplot)
library(rstatix)

# Set paths
# All inputs live under Data/EXP-0226/raw; figures are written to a sibling
# "figures" directory, created here if missing.
experiment_id <- "EXP-0226"
data_dir <- file.path("Data", experiment_id, "raw")
output_dir <- file.path("Data", experiment_id, "figures")
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
# PLACEHOLDER: Load band intensity data
# This would typically come from ImageJ/FIJI quantification of Western blot TIFFs
# For now, we'll create a placeholder data structure

# Function to read ImageJ quantification data
# In a real scenario, this would parse data exported from ImageJ
# NOTE(review): the filepath argument is currently ignored — the function
# always returns the hard-coded placeholder tibble below.
read_imagej_data <- function(filepath) {
  # If real data exists, uncomment and use:
  # read_csv(filepath)
  # For now, simulate with placeholder data
  # One row per lane: 2 IPs x 2 conditions x (Input, IP, IgG) = 12 lanes
  tibble(
    lane = 1:12,
    condition = rep(c("Control", "Control", "Control", "Adipogenic", "Adipogenic", "Adipogenic"), 2),
    antibody = c(rep("YBX1_IP", 6), rep("CEBPA_IP", 6)),
    sample_type = rep(c("Input", "IP", "IgG"), 4),
    intensity = c(
      # YBX1 IP probed for CEBPA
      1000, 120, 10, # Control (Input, IP, IgG)
      1200, 450, 15, # Adipogenic (Input, IP, IgG)
      # CEBPA IP probed for YBX1
      900, 100, 5, # Control (Input, IP, IgG)
      950, 320, 8 # Adipogenic (Input, IP, IgG)
    )
  )
}
# Process and analyze Co-IP data
# Pipeline: IgG background subtraction -> IP/Input enrichment -> fold change
# (Adipogenic vs Control) per antibody. Assumes exactly one IgG row and one
# Input row per (condition, antibody) group — TODO confirm for real exports.
analyze_coip_data <- function(df) {
  # Background subtraction (IgG control)
  df_bg <- df %>%
    group_by(condition, antibody) %>%
    mutate(
      # Find IgG value for this group
      igg_intensity = intensity[sample_type == "IgG"],
      # Subtract IgG background
      corrected_intensity = intensity - igg_intensity,
      # Set negative values to zero
      corrected_intensity = ifelse(corrected_intensity < 0, 0, corrected_intensity)
    )
  # Calculate enrichment (IP signal relative to input)
  df_enrichment <- df_bg %>%
    group_by(condition, antibody) %>%
    mutate(
      # Find input value for this group
      input_intensity = corrected_intensity[sample_type == "Input"],
      # Calculate enrichment as IP / Input
      enrichment = corrected_intensity / input_intensity,
      # For fold change calculations later
      IP_intensity = corrected_intensity[sample_type == "IP"]
    ) %>%
    filter(sample_type == "IP") %>% # Only keep IP samples for further analysis
    ungroup()
  # Calculate fold change in interaction (Adipogenic vs Control)
  fold_changes <- df_enrichment %>%
    group_by(antibody) %>%
    summarize(
      control_enrichment = enrichment[condition == "Control"],
      adipogenic_enrichment = enrichment[condition == "Adipogenic"],
      fold_change = adipogenic_enrichment / control_enrichment,
      percent_increase = (fold_change - 1) * 100
    )
  # Prepare and return results: all intermediate stages are kept so callers
  # can inspect each step of the calculation.
  list(
    raw_data = df,
    background_corrected = df_bg,
    enrichment = df_enrichment,
    fold_changes = fold_changes
  )
}
# Generate plots
# Builds two ggplot objects from analyze_coip_data() output: (1) enrichment
# by condition, faceted by IP antibody; (2) fold-change summary per antibody.
create_coip_plots <- function(results) {
  # Extract data
  enrichment_data <- results$enrichment
  # Bar plot of YBX1-CEBPA interaction by condition
  p1 <- ggplot(enrichment_data, aes(x = condition, y = enrichment, fill = condition)) +
    geom_bar(stat = "identity", width = 0.6) +
    facet_wrap(~antibody, scales = "free_y",
               labeller = labeller(antibody = c(
                 "YBX1_IP" = "YBX1 IP (probed for CEBPα)",
                 "CEBPA_IP" = "CEBPα IP (probed for YBX1)"
               ))) +
    labs(
      title = "YBX1-CEBPα Interaction in 3T3 Cells",
      subtitle = "With or without adipogenic stimulation (24h)",
      x = NULL,
      y = "Relative Enrichment (IP/Input)"
    ) +
    scale_fill_manual(values = c("Control" = "#99BBDD", "Adipogenic" = "#FF7755")) +
    theme_cowplot() +
    theme(
      legend.position = "bottom",
      strip.background = element_rect(fill = "white"),
      strip.text = element_text(face = "bold")
    )
  # Fold change summary (dashed line at 1 marks "no change")
  fold_change_data <- results$fold_changes
  p2 <- ggplot(fold_change_data, aes(x = antibody, y = fold_change, fill = antibody)) +
    geom_bar(stat = "identity", width = 0.6) +
    geom_hline(yintercept = 1, linetype = "dashed", color = "gray50") +
    labs(
      title = "Fold Change in YBX1-CEBPα Interaction",
      subtitle = "Adipogenic vs Control",
      x = NULL,
      y = "Fold Change (Adipogenic/Control)"
    ) +
    scale_x_discrete(labels = c(
      "YBX1_IP" = "YBX1 IP\n(probed for CEBPα)",
      "CEBPA_IP" = "CEBPα IP\n(probed for YBX1)"
    )) +
    scale_fill_manual(values = c("YBX1_IP" = "#3377BB", "CEBPA_IP" = "#DD5544")) +
    theme_cowplot() +
    theme(legend.position = "none")
  # Return plot objects
  list(
    interaction_by_condition = p1,
    fold_change = p2
  )
}
# Create a summary table
# Reformats the fold-change results into a human-readable table with rounded
# values and display-friendly column names, for printing/manuscript use.
create_summary_table <- function(results) {
  # Extract fold change data
  fold_data <- results$fold_changes
  # Create a formatted table
  summary_table <- fold_data %>%
    mutate(
      Antibody = case_when(
        antibody == "YBX1_IP" ~ "YBX1 IP (probed for CEBPα)",
        antibody == "CEBPA_IP" ~ "CEBPα IP (probed for YBX1)"
      ),
      Control = round(control_enrichment, 2),
      Adipogenic = round(adipogenic_enrichment, 2),
      `Fold Change` = round(fold_change, 2),
      `% Increase` = round(percent_increase, 1)
    ) %>%
    select(Antibody, Control, Adipogenic, `Fold Change`, `% Increase`)
  # Return the formatted table
  summary_table
}
# Main execution
# Runs the full placeholder pipeline: simulated band data -> analysis ->
# plots -> summary table -> console report. Swap in real ImageJ exports
# (see the notes at the bottom) before interpreting any numbers.
# PLACEHOLDER: In a real scenario, we would load actual data from ImageJ quantification files
# imagej_data_path <- file.path(data_dir, "western_blot_quantification.csv")
# band_data <- read_imagej_data(imagej_data_path)
# For demonstration, use our simulated data
band_data <- read_imagej_data(NULL)
# Analyze the data
results <- analyze_coip_data(band_data)
# Create plots
plots <- create_coip_plots(results)
# Save plots
# ggsave(file.path(output_dir, "YBX1_CEBPA_interaction.pdf"), plots$interaction_by_condition, width = 8, height = 6)
# ggsave(file.path(output_dir, "YBX1_CEBPA_fold_change.pdf"), plots$fold_change, width = 6, height = 5)
# Create summary table
summary_table <- create_summary_table(results)
# Print summary
cat("\n")
cat("========================================\n")
cat("EXP-0226 YBX1-CEBPα Interaction Analysis\n")
cat("========================================\n\n")
cat("Experiment: YBX1-CEBPA Protein Interaction in Early Adipogenesis\n")
cat("Date: 2025-05-11\n")
cat("Researchers: james-m-jordan, linda-onsei\n\n")
cat("SUMMARY OF RESULTS (PLACEHOLDER DATA):\n\n")
print(summary_table)
cat("\nNotes:\n")
cat("- Both YBX1 and CEBPα show increased interaction after adipogenic induction\n")
cat("- The interaction appears to be reciprocal and specific (minimal IgG background)\n")
cat("- For actual results, replace the placeholder data with real ImageJ quantification\n\n")
cat("Plots saved to:", output_dir, "\n")
cat("========================================\n")
# IMPORTANT NOTES FOR REAL ANALYSIS:
# 1. Replace the simulated data with actual ImageJ/FIJI quantification of Western blots
# 2. Consider adding statistical analysis (t-tests between conditions)
# 3. Uncomment the ggsave commands to save the plots
# 4. Consider additional normalization strategies if needed (e.g., for input variation)

View File

@ -1,295 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Histidine Interface Visualization\n",
"\n",
"This notebook visualizes histidine-mediated cation-\u03c0 and \u03c0-\u03c0 interactions in protein structures."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import required packages\n",
"import py3Dmol\n",
"import os\n",
"import tempfile\n",
"from Bio import PDB\n",
"from IPython.display import HTML, display\n",
"import glob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Constants for visualization\n",
"CHAIN_A_SURFACE = '#4e79a7' # Darker blue\n",
"CHAIN_A_STICK = '#85b0d5' # Lighter blue\n",
"CHAIN_A_LABEL = '#2c4e6f' # Dark blue for label text\n",
"\n",
"CHAIN_B_SURFACE = '#f2be2b' # Gold\n",
"CHAIN_B_STICK = '#f2be2b' # Same as surface\n",
"CHAIN_B_LABEL = '#8B4513' # Dark brown\n",
"\n",
"# Amino acid mapping\n",
"ONE_LETTER_MAP = {\n",
" 'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D',\n",
" 'CYS': 'C', 'GLN': 'Q', 'GLU': 'E', 'GLY': 'G',\n",
" 'HIS': 'H', 'ILE': 'I', 'LEU': 'L', 'LYS': 'K',\n",
" 'MET': 'M', 'PHE': 'F', 'PRO': 'P', 'SER': 'S',\n",
" 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V'\n",
"}\n",
"\n",
"# Residue type definitions\n",
"CATION_RES = {'ARG', 'LYS', 'HIS'}\n",
"AROMATIC_RES = {'PHE', 'TYR', 'TRP', 'HIS'}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def convert_cif_to_pdb(cif_file):\n",
" \"\"\"Convert a CIF file to PDB format using BioPython.\"\"\"\n",
" try:\n",
" fd, temp_pdb = tempfile.mkstemp(suffix=\".pdb\")\n",
" os.close(fd)\n",
" parser = PDB.MMCIFParser(QUIET=True)\n",
" structure = parser.get_structure(\"structure\", cif_file)\n",
" io = PDB.PDBIO()\n",
" io.set_structure(structure)\n",
" io.save(temp_pdb)\n",
" return temp_pdb\n",
" except Exception as e:\n",
" print(f\"Error converting {cif_file} to PDB: {e}\")\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_sidechain_top_atom(residue):\n",
" \"\"\"Get the top atom of a residue's sidechain for visualization.\"\"\"\n",
" if residue.get_resname() == 'HIS':\n",
" return residue['CE1']\n",
" elif residue.get_resname() in {'PHE', 'TYR'}:\n",
" return residue['CZ']\n",
" elif residue.get_resname() == 'TRP':\n",
" return residue['CH2']\n",
" elif residue.get_resname() == 'ARG':\n",
" return residue['CZ']\n",
" elif residue.get_resname() == 'LYS':\n",
" return residue['NZ']\n",
" return None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def find_histidine_pairs(chain_a, chain_b, distance_cutoff=5.0):\n",
" \"\"\"Identify cation\u2013\u03c0 or \u03c0\u2013\u03c0 interactions with at least one HIS residue.\"\"\"\n",
" pairs = []\n",
" for residue_a in chain_a:\n",
" resn_a = residue_a.get_resname()\n",
" for residue_b in chain_b:\n",
" resn_b = residue_b.get_resname()\n",
" is_a_HIS = (resn_a == 'HIS')\n",
" is_b_HIS = (resn_b == 'HIS')\n",
" is_a_cation_or_aromatic = (resn_a in CATION_RES or resn_a in AROMATIC_RES)\n",
" is_b_cation_or_aromatic = (resn_b in CATION_RES or resn_b in AROMATIC_RES)\n",
"\n",
" if (is_a_HIS and is_b_cation_or_aromatic) or (is_b_HIS and is_a_cation_or_aromatic):\n",
" for atom_a in residue_a:\n",
" for atom_b in residue_b:\n",
" try:\n",
" if (atom_a - atom_b) < distance_cutoff:\n",
" if (is_a_HIS and resn_b in CATION_RES) or (is_b_HIS and resn_a in CATION_RES):\n",
" itype = '+:\u03c0' # cation\u2013\u03c0\n",
" else:\n",
" itype = '\u03c0:\u03c0' # \u03c0\u2013\u03c0\n",
" pairs.append((residue_a, residue_b, itype))\n",
" break\n",
" except Exception:\n",
" continue\n",
" else:\n",
" continue\n",
" break\n",
" return pairs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def create_viewer(pdb_data, viewer_type='ribbon', histidine_pairs=None):\n",
" \"\"\"Create a py3Dmol viewer with the specified visualization type.\"\"\"\n",
" viewer = py3Dmol.view(width=800, height=600)\n",
" viewer.addModel(pdb_data, \"pdb\")\n",
" \n",
" # Add surfaces\n",
" viewer.addSurface(py3Dmol.SAS, {'opacity': 0.6, 'color': CHAIN_A_SURFACE}, {'chain': 'A'})\n",
" viewer.addSurface(py3Dmol.SAS, {'opacity': 0.6, 'color': CHAIN_B_SURFACE}, {'chain': 'B'})\n",
" \n",
" if viewer_type == 'ribbon':\n",
" # Add ribbon view\n",
" viewer.setStyle({'chain': 'A'}, {'cartoon': {'color': CHAIN_A_SURFACE, 'opacity': 1.0}})\n",
" viewer.setStyle({'chain': 'B'}, {'cartoon': {'color': CHAIN_B_SURFACE, 'opacity': 1.0}})\n",
" else:\n",
" # Hide cartoon and show sticks for interacting residues\n",
" viewer.setStyle({'model': -1}, {'cartoon': {'hidden': True}})\n",
" \n",
" if histidine_pairs:\n",
" for resA, resB, itype in histidine_pairs:\n",
" chainA_id = resA.get_parent().id\n",
" chainB_id = resB.get_parent().id\n",
" resA_id = resA.get_id()[1]\n",
" resB_id = resB.get_id()[1]\n",
" \n",
" colorA = CHAIN_A_STICK if chainA_id == 'A' else CHAIN_B_STICK\n",
" colorB = CHAIN_A_STICK if chainB_id == 'A' else CHAIN_B_STICK\n",
" \n",
" viewer.setStyle({'chain': chainA_id, 'resi': resA_id}, \n",
" {'stick': {'color': colorA, 'radius': 0.3}})\n",
" viewer.setStyle({'chain': chainB_id, 'resi': resB_id}, \n",
" {'stick': {'color': colorB, 'radius': 0.3}})\n",
" \n",
" # Add dotted line between interacting residues\n",
" topA = get_sidechain_top_atom(resA)\n",
" topB = get_sidechain_top_atom(resB)\n",
" if topA and topB:\n",
" x1, y1, z1 = topA.coord\n",
" x2, y2, z2 = topB.coord\n",
" viewer.addLine({\n",
" 'start': {'x': float(x1), 'y': float(y1), 'z': float(z1)},\n",
" 'end': {'x': float(x2), 'y': float(y2), 'z': float(z2)},\n",
" 'color': 'blue',\n",
" 'linewidth': 4,\n",
" 'dashed': True,\n",
" 'dashLength': 0.4,\n",
" 'gapLength': 0.2\n",
" })\n",
" \n",
" viewer.zoomTo()\n",
" return viewer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def visualize_structure(file_path):\n",
"    \"\"\"Visualize a structure with both ribbon and labeled views.\n",
"\n",
"    Accepts a .pdb file or a .cif file (converted to a temporary PDB\n",
"    first). Any temporary file created for a CIF input is always removed,\n",
"    even when parsing fails or chains A/B are missing.\n",
"    \"\"\"\n",
"    # Handle CIF files\n",
"    temp_pdb = None\n",
"    if file_path.lower().endswith('.cif'):\n",
"        temp_pdb = convert_cif_to_pdb(file_path)\n",
"        if not temp_pdb:\n",
"            print(f\"Could not process CIF file: {file_path}\")\n",
"            return\n",
"        file_path = temp_pdb\n",
"    \n",
"    try:\n",
"        # Parse structure\n",
"        parser = PDB.PDBParser(QUIET=True)\n",
"        structure = parser.get_structure('model', file_path)\n",
"        \n",
"        try:\n",
"            chain_a = structure[0]['A']\n",
"            chain_b = structure[0]['B']\n",
"        except KeyError:\n",
"            print(f\"Could not find chain A or B in: {file_path}\")\n",
"            return\n",
"        \n",
"        # Find histidine pairs\n",
"        histidine_pairs = find_histidine_pairs(chain_a, chain_b, distance_cutoff=5.0)\n",
"        \n",
"        # Read PDB data\n",
"        with open(file_path, 'r') as fh:\n",
"            pdb_data = fh.read()\n",
"        \n",
"        # Create viewers\n",
"        ribbon_viewer = create_viewer(pdb_data, 'ribbon')\n",
"        label_viewer = create_viewer(pdb_data, 'label', histidine_pairs)\n",
"        \n",
"        # Display viewers side by side\n",
"        display(HTML(f\"<div style='display: flex; justify-content: space-between;'>\"))\n",
"        display(HTML(\"<div style='width: 48%;'>\"))\n",
"        ribbon_viewer.show()\n",
"        display(HTML(\"</div>\"))\n",
"        display(HTML(\"<div style='width: 48%;'>\"))\n",
"        label_viewer.show()\n",
"        display(HTML(\"</div>\"))\n",
"        display(HTML(\"</div>\"))\n",
"    finally:\n",
"        # BUG FIX: the original tested file_path (already rebound to the\n",
"        # temporary .pdb path) against '.cif', so the temp file was never\n",
"        # deleted; early returns also leaked it. Track the temp path\n",
"        # separately and always clean it up.\n",
"        if temp_pdb and os.path.exists(temp_pdb):\n",
"            os.remove(temp_pdb)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# List available PDB/CIF files\n",
"model_files = glob.glob('ndufs-7-acot-9-mm-af2-models/*.pdb') + \\\n",
" glob.glob('ndufs-7-acot-9-mm-af2-models/*.cif')\n",
"print(f\"Found {len(model_files)} model files:\")\n",
"for i, file in enumerate(model_files):\n",
" print(f\"{i+1}. {os.path.basename(file)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize each model\n",
"for i, file_path in enumerate(model_files):\n",
" print(f\"\\nProcessing model {i+1}: {os.path.basename(file_path)}\")\n",
" visualize_structure(file_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -1,223 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "a1e354d5",
"metadata": {},
"source": [
"# Lab Protocol Dashboard\n",
"\n",
"This notebook provides an interactive dashboard to explore and manage both YAML protocols and Markdown protocols with YAML frontmatter.\n",
"\n",
"## Features\n",
"- View all protocols in a searchable table\n",
"- Filter by protocol type (YAML or Markdown)\n",
"- Compare protocol structures\n",
"- Visualize protocol statistics\n",
"\n",
"Let's start by importing the required libraries and setting up our environment."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c5f18cd",
"metadata": {},
"outputs": [],
"source": [
"# Install dependencies if not already installed\n",
"import sys\n",
"import subprocess\n",
"\n",
"def install_package(package, import_name=None):\n",
"    \"\"\"Install *package* with pip unless it is already importable.\n",
"\n",
"    import_name: module name to import-test when it differs from the pip\n",
"    package name (e.g. pip package 'pyyaml' provides module 'yaml').\n",
"    Defaults to *package* itself, preserving the original call signature.\n",
"    \"\"\"\n",
"    try:\n",
"        __import__(import_name or package)\n",
"        print(f\"{package} is already installed\")\n",
"    except ImportError:\n",
"        print(f\"Installing {package}...\")\n",
"        subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])\n",
"        print(f\"{package} installed successfully\")\n",
"\n",
"# Install required packages\n",
"install_package(\"pandas\")\n",
"install_package(\"matplotlib\")\n",
"install_package(\"ipywidgets\")\n",
"# BUG FIX: pip package 'pyyaml' imports as 'yaml'; __import__('pyyaml')\n",
"# always raised ImportError, so pip re-ran on every execution of this cell.\n",
"install_package(\"pyyaml\", \"yaml\")\n",
"install_package(\"plotly\")\n",
"\n",
"print(\"\\nAll dependencies are installed and ready to use.\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4c1f189",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import re\n",
"import yaml\n",
"import glob\n",
"import pandas as pd\n",
"import plotly.express as px\n",
"import matplotlib.pyplot as plt\n",
"import ipywidgets as widgets\n",
"from datetime import datetime\n",
"from IPython.display import display, HTML, Markdown\n",
"\n",
"# Configure paths\n",
"WORKSPACE_ROOT = \"/workspaces/docs\"\n",
"PROTOCOLS_DIR = os.path.join(WORKSPACE_ROOT, \"Protocols\")\n",
"\n",
"print(f\"Workspace root: {WORKSPACE_ROOT}\")\n",
"print(f\"Protocols directory: {PROTOCOLS_DIR}\")\n",
"print(f\"Current working directory: {os.getcwd()}\")"
]
},
{
"cell_type": "markdown",
"id": "b384ad20",
"metadata": {},
"source": [
"## Load Protocol Data\n",
"\n",
"Now we'll load all protocol data from both YAML files and Markdown files with YAML frontmatter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def extract_frontmatter(markdown_content):\n",
" \"\"\"Extract YAML frontmatter from markdown content\"\"\"\n",
" pattern = r\"^---\\n(.*?)\\n---\"\n",
" match = re.search(pattern, markdown_content, re.DOTALL)\n",
" if match:\n",
" try:\n",
" return yaml.safe_load(match.group(1))\n",
" except yaml.YAMLError:\n",
" return None\n",
" return None\n",
"\n",
"def load_protocol_files():\n",
" \"\"\"Load protocol data from both YAML and Markdown files\"\"\"\n",
" protocols = []\n",
" \n",
" # Process YAML files\n",
" yaml_files = glob.glob(os.path.join(PROTOCOLS_DIR, \"*.yaml\"))\n",
" for file_path in yaml_files:\n",
" try:\n",
" with open(file_path, 'r') as f:\n",
" data = yaml.safe_load(f)\n",
" if data:\n",
" data['file_path'] = os.path.basename(file_path)\n",
" data['file_type'] = 'yaml'\n",
" protocols.append(data)\n",
" except Exception as e:\n",
" print(f\"Error reading {file_path}: {e}\")\n",
" \n",
" # Process Markdown files with frontmatter\n",
" md_files = glob.glob(os.path.join(PROTOCOLS_DIR, \"*.md\"))\n",
" for file_path in md_files:\n",
" try:\n",
" with open(file_path, 'r') as f:\n",
" content = f.read()\n",
" frontmatter = extract_frontmatter(content)\n",
" if frontmatter:\n",
" frontmatter['file_path'] = os.path.basename(file_path)\n",
" frontmatter['file_type'] = 'markdown'\n",
" \n",
" # Extract content preview (first 100 chars)\n",
" content_without_frontmatter = re.sub(r\"^---\\n.*?\\n---\\n\", \"\", content, flags=re.DOTALL)\n",
" preview = content_without_frontmatter.strip()[:100] + \"...\"\n",
" frontmatter['content_preview'] = preview\n",
" \n",
" protocols.append(frontmatter)\n",
" except Exception as e:\n",
" print(f\"Error reading {file_path}: {e}\")\n",
" \n",
" return protocols\n",
"\n",
"# Load all protocols\n",
"protocols = load_protocol_files()\n",
"print(f\"Loaded {len(protocols)} protocols\")\n",
"\n",
"# Convert to DataFrame for easier manipulation\n",
"df_protocols = pd.DataFrame(protocols)\n",
"\n",
"# Fill missing values with placeholders\n",
"for col in ['id', 'name', 'version', 'description', 'author', 'created']:\n",
" if col not in df_protocols.columns:\n",
" df_protocols[col] = None\n",
"\n",
"# Preview the dataframe\n",
"df_protocols[['file_path', 'file_type', 'id', 'name', 'version']].head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Protocol Dashboard\n",
"\n",
"Let's create a dashboard to explore our protocols. We'll include:\n",
"1. Summary statistics\n",
"2. Interactive filtering\n",
"3. Protocol details viewer"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 1. Summary statistics\n",
"yaml_count = len(df_protocols[df_protocols['file_type'] == 'yaml'])\n",
"md_count = len(df_protocols[df_protocols['file_type'] == 'markdown'])\n",
"\n",
"# Create a nice HTML summary\n",
"summary_html = f\"\"\"\n",
"<div style=\"background-color: #f5f5f5; padding: 15px; border-radius: 10px; margin-bottom: 20px;\">\n",
" <h2 style=\"margin-top: 0;\">Protocol Dashboard Summary</h2>\n",
" <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>\n",
" <p><strong>Total Protocols:</strong> {len(df_protocols)}</p>\n",
" <ul>\n",
" <li><strong>YAML Files:</strong> {yaml_count}</li>\n",
" <li><strong>Markdown with Frontmatter:</strong> {md_count}</li>\n",
" </ul>\n",
"</div>\n",
"\"\"\"\n",
"\n",
"display(HTML(summary_html))\n",
"\n",
"# Create a pie chart of file types\n",
"fig = px.pie(values=[yaml_count, md_count], \n",
" names=['YAML', 'Markdown'], \n",
" title='Protocol File Types',\n",
" color_discrete_sequence=['#636EFA', '#EF553B'])\n",
"fig.update_layout(width=600, height=400)\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Interactive Protocol Explorer\n",
"\n",
"Use the filters below to explore your protocols:"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,121 +0,0 @@
#!/usr/bin/env python3
"""
Protocol Dashboard Generator
This script creates a simple terminal-based dashboard of all your lab protocols,
showing both standalone YAML files and Markdown files with YAML frontmatter.
"""
import os
import re
import yaml
import glob
from datetime import datetime
# Configuration
PROTOCOLS_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "Protocols")
def extract_frontmatter(markdown_content):
    """Return the parsed YAML frontmatter of a markdown document.

    Looks for a leading ``---`` ... ``---`` fence at the very start of the
    text and parses its contents with ``yaml.safe_load``. Returns ``None``
    when no fence is present or the YAML inside it is invalid.
    """
    fence = re.search(r"^---\n(.*?)\n---", markdown_content, re.DOTALL)
    if fence is None:
        return None
    try:
        return yaml.safe_load(fence.group(1))
    except yaml.YAMLError:
        return None
def load_protocol_files():
    """Collect protocol metadata from the Protocols directory.

    Reads every ``*.yaml`` file as a whole-document protocol and every
    ``*.md`` file whose leading YAML frontmatter parses, tagging each
    record with its basename (``file_path``) and a ``file_type`` of
    ``'yaml'`` or ``'markdown'``. Files that cannot be read or parsed are
    reported to stdout and skipped.
    """
    records = []

    # Standalone YAML protocols: the whole file is the record.
    for path in glob.glob(os.path.join(PROTOCOLS_DIR, "*.yaml")):
        try:
            with open(path, 'r') as handle:
                parsed = yaml.safe_load(handle)
            if parsed:
                parsed['file_path'] = os.path.basename(path)
                parsed['file_type'] = 'yaml'
                records.append(parsed)
        except Exception as exc:
            print(f"Error reading {path}: {exc}")

    # Markdown protocols: only the frontmatter block becomes the record.
    for path in glob.glob(os.path.join(PROTOCOLS_DIR, "*.md")):
        try:
            with open(path, 'r') as handle:
                meta = extract_frontmatter(handle.read())
            if meta:
                meta['file_path'] = os.path.basename(path)
                meta['file_type'] = 'markdown'
                records.append(meta)
        except Exception as exc:
            print(f"Error reading {path}: {exc}")

    return records
def _print_protocol_entries(entries):
    """Print the per-protocol detail block for each record in *entries*."""
    for protocol in entries:
        print(f"\nID: {protocol.get('id', 'No ID')}")
        print(f"Name: {protocol.get('name', 'Unnamed')}")
        print(f"Version: {protocol.get('version', 'Unknown')}")
        print(f"File: {protocol.get('file_path')}")
        print(f"Description: {protocol.get('description', 'No description')}")
        if protocol.get('materials'):
            print(f"Materials: {len(protocol['materials'])} items")
        if protocol.get('steps'):
            print(f"Steps: {len(protocol['steps'])} steps")


def print_terminal_dashboard(protocols):
    """Display a simple terminal-based dashboard.

    Prints overall counts, then one section per file type (standalone YAML
    first, then Markdown-with-frontmatter), then usage recommendations.
    The per-entry printing was duplicated verbatim for the two sections in
    the original; it is factored into _print_protocol_entries.

    NOTE: sorts *protocols* in place by id (matching the original,
    caller-visible behavior); records without an id sort last via the
    'ZZZZ' placeholder.
    """
    print("\n" + "="*80)
    print(f"LAB PROTOCOL DASHBOARD - Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*80)
    # Sort once so both sections come out in id order.
    protocols.sort(key=lambda x: str(x.get('id', 'ZZZZ')))
    yaml_protocols = [p for p in protocols if p.get('file_type') == 'yaml']
    md_protocols = [p for p in protocols if p.get('file_type') == 'markdown']
    print(f"\nTotal Protocols: {len(protocols)}")
    print(f"YAML Files: {len(yaml_protocols)}")
    print(f"Markdown with Frontmatter: {len(md_protocols)}")
    # Print YAML protocols
    if yaml_protocols:
        print("\n" + "-"*80)
        print("STANDALONE YAML PROTOCOLS")
        print("-"*80)
        _print_protocol_entries(yaml_protocols)
    # Print Markdown protocols
    if md_protocols:
        print("\n" + "-"*80)
        print("MARKDOWN PROTOCOLS WITH FRONTMATTER")
        print("-"*80)
        _print_protocol_entries(md_protocols)
    print("\n" + "="*80)
    print("USAGE RECOMMENDATIONS:")
    print("="*80)
    print("- YAML Files: Great for machine processing and programmatic access")
    print("- Markdown+Frontmatter: Better for detailed protocols with rich formatting")
    print("- Both formats work well with the Lab Agent and can be used together")
    print("="*80 + "\n")
if __name__ == "__main__":
    # Script entry point: gather every protocol record from PROTOCOLS_DIR,
    # then render the terminal dashboard.
    protocols = load_protocol_files()
    print_terminal_dashboard(protocols)

View File

@ -6,6 +6,12 @@ _Note: This file is now automatically updated by the agent/task-runner after eac
This file will be updated with a summary of major actions, sessions, and changes as the system is used.
## [2025-05-07] New Protocols and Experiments
- Created new protocol for Adipogenic Induction Treatment (PROT-0036)
- Generated experiment YAML for YBX1-CEBPA Co-IP in 3T3 cells during adipogenesis (EXP-0226)
- Created data directories and analysis script for YBX1-CEBPA interaction quantification
- Updated TASKS.md with timeline for new experiment
## [2025-05-06] Ybx1 mRNA Stability Project
- Created new project entry for Post-transcriptional regulation by Ybx1
- Created subproject for mRNA stability measurements

View File

@ -1 +0,0 @@
{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"1T8_4vk4DKSu7PBBJTkIJoyaXmiVHR41zGAja3ayYNfs","resource_key":"","email":"jim@jordanlab.org"}

View File

@ -1 +0,0 @@
{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"17tzXhIf6A8-_uK_PkYgMIgadQku1LmTOwvo666csS-g","resource_key":"","email":"jim@jordanlab.org"}

View File

@ -1 +0,0 @@
{"":"WARNING! DO NOT EDIT THIS FILE! ANY CHANGES MADE WILL BE LOST!","doc_id":"18o5Fo_FWeKFw9B09eGG444yrDn_0WIzG3hcaNE7dZ2w","resource_key":"","email":"jim@jordanlab.org"}

View File

@ -1,13 +0,0 @@
# Data Folder
This folder stores data outputs or references to data generated by experiments.
## How to Add Data
- Organize data by experiment or project, e.g., `Data/Images/EXP-0002/` for images from experiment 0002.
- For large data, store externally and add a pointer (URL or path) in the experiment YAML.
- For small data, add files directly to the appropriate subfolder.
- Update the relevant experiment YAML to reference the data files or links.
## Best Practices
- Keep data organized and clearly linked to experiments/projects.
- Do not store sensitive or very large files directly in the repository; use external storage and reference them.

View File

View File

@ -1,8 +0,0 @@
{
"demo_user": {
"frequent_protocols": [
"Sample Protocol"
],
"last_active": "2025-05-05"
}
}

View File

@ -0,0 +1,191 @@
---
# EXPERIMENT METADATA
experiment_id: EXP-0225
title: "mRNA Stability Assay with YBX1 Knockdown in Huh7 and HepG2 Cells"
date: 2025-05-07
researchers:
- james-m-jordan
- jack-zhao
protocol_id: PROT-0035
protocol_name: "YBX1 Knockdown mRNA Stability Assay"
status: planned # planned | in-progress | completed | failed
aim: "Measure the effect of YBX1 knockdown on mRNA stability in HepG2 and Huh7 cells by monitoring decay of target mRNAs after transcription inhibition with actinomycin D"
project: "Post-transcriptional regulation by Ybx1"
---
---
# SAMPLE METADATA
cell_lines:
- name: "Huh7"
media: "RPMI-1640 + 10% FBS + antibiotics"
passage: "P5-P10"
- name: "HepG2"
media: "DMEM/F-12 + 10% FBS + antibiotics"
passage: "P3-P8"
plate_format: "24-well"
condition_map: |
A1-A6: Huh7 + siYBX1
B1-B6: Huh7 + siCTRL
C1-C6: HepG2 + siYBX1
D1-D6: HepG2 + siCTRL
replicates: 6
---
---
# REAGENTS & INSTRUMENT SETTINGS
transfection:
reagent: "Lipofectamine RNAiMAX"
siRNA_volume_per_well: "1 µL"
complex_volume: "52 µL"
incubation_time: "5 min"
siRNA:
- name: "siYBX1"
concentration: "10 nM final"
supplier: "Dharmacon"
- name: "siCTRL (non-targeting)"
concentration: "10 nM final"
supplier: "Dharmacon"
actinomycin_D:
concentration: "5 µg/mL"
solvent: "DMSO"
storage: "-20°C, protected from light"
timepoints:
- "0h (before ActD)"
- "1h"
- "2h"
- "4h"
- "8h"
target_genes:
- "YBX1 (knockdown verification)"
- "GAPDH (reference gene)"
- "ACTB (reference gene)"
- "IL6 (example YBX1 target)"
- "MYC (example YBX1 target)"
instruments:
- name: "Real-time PCR System"
model: "Applied Biosystems QuantStudio 3"
settings: "95°C 15s, 60°C 60s for 40 cycles"
---
# 1⃣ Experiment Timeline & Execution
## Day 1: 2025-05-07
- [ ] Seed cells in 24-well plates:
- Huh7: 5 × 10⁴ cells / well
- HepG2: 6 × 10⁴ cells / well
- [ ] Prepare plates and label wells according to condition map
- [ ] Incubate O/N at 37°C + 5% CO₂
## Day 2: 2025-05-08
- [ ] Prepare siRNA transfection:
- Solution A: siRNA in Opti-MEM (26 µL per well)
- Solution B: RNAiMAX in Opti-MEM (26 µL per well)
- Combine A+B, incubate 5 min at RT
- [ ] Aspirate spent medium from wells
- [ ] Add 0.9 mL fresh complete medium to each well
- [ ] Add 52 µL transfection complex to appropriate wells
- [ ] Gently rock plate to distribute complexes
- [ ] Return plates to 37°C + 5% CO₂ incubator
## Day 3: 2025-05-09
- [ ] Collect one well from each condition for knockdown verification:
- Extract RNA using TRIzol
- RT-qPCR for YBX1 (vs control wells)
- Confirm >70% knockdown
## Day 4: 2025-05-10
- [ ] Preparation for actinomycin D treatment:
- Label tubes for all timepoints
- Thaw actinomycin D (protect from light)
- [ ] Collect t=0 samples (before actinomycin D)
- [ ] Add actinomycin D (5 µg/mL final) to all remaining wells
- [ ] Collect cells at timepoints (1h, 2h, 4h, 8h):
- Aspirate medium
- Add TRIzol directly to wells (500 µL)
- Transfer lysate to labeled tubes
- Store at -80°C
## Day 5: 2025-05-11
- [ ] Complete RNA isolation from all samples using TRIzol protocol
- [ ] Quantify RNA and verify integrity
- [ ] Perform cDNA synthesis using SuperScript III RT kit
## Day 6: 2025-05-12
- [ ] Perform qPCR for target genes and reference genes
- [ ] Calculate relative expression and half-lives
- [ ] Analyze differences between control and YBX1 knockdown
# 2⃣ Raw Data & Resources
_Place files in `Data/EXP-0225/raw/` and list/link them here._
| Filename | Description | Date Added |
|----------|-------------|------------|
| `knockdown_qPCR.xlsx` | Day 3 YBX1 knockdown verification | 2025-05-09 |
| `timecourse_qPCR.xlsx` | Timepoint qPCR data for all targets | 2025-05-12 |
| `RNA_concentrations.xlsx` | RNA yield and A260/280 ratios | 2025-05-11 |
# 3⃣ Results & Analysis
## QC Metrics
_Add RNA integrity values, knockdown efficiency, etc._
## Knockdown Efficiency
```
# To be filled after Day 3 verification
```
## Half-life Calculations
```
# To be filled after completing qPCR analysis
| Gene | t½ (Huh7 siCTRL) | t½ (Huh7 siYBX1) | Ratio | p-value |
|------|------------------|------------------|-------|---------|
| IL6 | | | | |
| MYC | | | | |
| Gene | t½ (HepG2 siCTRL) | t½ (HepG2 siYBX1) | Ratio | p-value |
|------|-------------------|-------------------|-------|---------|
| IL6 | | | | |
| MYC | | | | |
```
## Analysis Notes
_Add notes about analysis methods, tools used, etc._
Analysis script: `Analysis/EXP-0225_mRNA_stability_analysis.R`
* Half-life calculated by plotting ln(relative mRNA level) vs time
* Linear regression slope (k) used to calculate t½ = ln(2)/|k|
* Statistical analysis: paired t-test between conditions
# 4⃣ Interpretation
## Summary of Findings
_To be completed after experiment_
## Cell Type Comparison
_Compare the effect of YBX1 knockdown on mRNA stability between Huh7 and HepG2 cells_
## Relation to Project Goals
This experiment directly addresses our hypothesis that YBX1 stabilizes specific mRNAs in liver cancer cells. By comparing two liver cancer cell lines (Huh7 and HepG2), we can determine if YBX1's role in mRNA stability is conserved across different liver cancer subtypes or if it's cell-line specific.
# 5⃣ Next Steps ✅
_Check boxes when complete. These can auto-update TASKS.md._
- [ ] Verify YBX1 knockdown at protein level by western blot
- [ ] Repeat experiment with additional YBX1 target genes
- [ ] Compare results with m6A-seq data to correlate with methylation sites
- [ ] Present results at lab meeting on 2025-05-17
- [ ] Consider rescue experiment with YBX1 overexpression
# 6⃣ Team Discussion
_Use this section for team comments, suggestions, and feedback._
> **james-m-jordan (2025-05-07):** Let's make sure we're collecting enough material for both RNA and protein analysis. We might want to include a few extra wells for protein extraction to verify knockdown by western blot too.
> **jack-zhao (2025-05-07):** I suggest we include MALAT1 as another target - it's a long non-coding RNA reported to interact with YBX1.
# 7⃣ References & Related Experiments
- Related protocol: [YBX1 Knockdown mRNA Stability Assay](Protocols/ybx1_knockdown_mrna_stability_protocol.yaml)
- Previous experiment: [EXP-0220](Experiments/EXP-0220_YBX1_expression_profiling.md)
- Literature: Wei YY, et al. (2021) YBX1 binds to m6A-methylated mRNAs to promote their stability and translation. Nature Communications 12:1278

View File

@ -0,0 +1,204 @@
---
# EXPERIMENT METADATA
experiment_id: EXP-0226
title: "YBX1-CEBPA Protein Interaction in Early Adipogenesis (Iteration 2)"
date: 2025-05-08
researchers:
- james-m-jordan
- linda-onsei
protocol_id: PROT-0036
protocol_name: "Adipogenic Induction Treatment"
status: planned # planned | in-progress | completed | failed
aim: "Investigate physical interaction between YBX1 and C/EBPα during early adipogenesis (24h post-induction) in 3T3 cells using reciprocal co-immunoprecipitation"
project: "Transcriptional Regulation in Early Adipogenesis"
---
---
# SAMPLE METADATA
cell_lines:
- name: "3T3"
media: "DMEM high glucose + 10% FBS + 1% Pen-Strep"
passage: "P8-P12"
plate_format: "10 cm dishes"
condition_map: |
Dish 1-3: 3T3 + Control medium (24h)
Dish 4-6: 3T3 + Adipogenic induction medium (24h)
replicates: 3
---
---
# REAGENTS & INSTRUMENT SETTINGS
adipogenic_induction:
reagents:
- name: "IBMX"
concentration: "0.5 mM"
supplier: "Sigma-Aldrich (I5879)"
- name: "Dexamethasone"
concentration: "1 µM"
supplier: "Sigma-Aldrich (D4902)"
- name: "Insulin"
concentration: "10 µg/mL"
supplier: "Sigma-Aldrich (I6634)"
cell_lysis:
buffer: "RIPA Buffer with protease inhibitors"
volume: "500 µL per dish"
incubation: "30 min on ice with occasional vortexing"
co_immunoprecipitation:
antibodies:
- name: "Anti-YBX1"
amount: "5 µg per IP"
supplier: "Cell Signaling Technology (#4202)"
- name: "Anti-C/EBPα"
amount: "5 µg per IP"
supplier: "Cell Signaling Technology (#8178)"
- name: "Normal Rabbit IgG (control)"
amount: "5 µg per IP"
supplier: "Cell Signaling Technology (#2729)"
beads: "Protein A/G magnetic beads"
volume: "30 µL per IP"
binding: "Overnight at 4°C with rotation"
western_blot:
gel: "Invitrogen NuPAGE 4-12% Bis-Tris"
transfer: "iBlot 3 Dry Blotting System (P0 program, 7 min)"
antibody_detection: "iBind 3 Western System"
primary_antibodies:
- name: "Anti-YBX1"
dilution: "1:1000"
supplier: "Cell Signaling Technology (#4202)"
- name: "Anti-C/EBPα"
dilution: "1:1000"
supplier: "Cell Signaling Technology (#8178)"
secondary_antibody: "Anti-rabbit HRP, 1:5000"
imaging: "ChemiDoc Imaging System"
instruments:
- name: "Invitrogen iBlot 3"
settings: "P0 program, 7 minutes"
- name: "Invitrogen iBind 3"
settings: "Standard protocol, 3 hours"
- name: "ChemiDoc Imaging System"
settings: "Chemiluminescence, auto-exposure"
---
# 1⃣ Experiment Timeline & Execution
## Day 1: 2025-05-08
- [ ] Seed 3T3 cells in six 10 cm dishes at density of 5 × 10⁵ cells/dish
- [ ] Incubate overnight at 37°C, 5% CO₂
- [ ] Prepare stock solutions for adipogenic induction medium
## Day 2: 2025-05-09
- [ ] Verify cells are ~90% confluent
- [ ] Prepare fresh adipogenic induction medium:
- [ ] IBMX (0.5 mM)
- [ ] Dexamethasone (1 µM)
- [ ] Insulin (10 µg/mL)
- [ ] Replace media:
- [ ] Dishes 1-3: Regular complete medium (control)
- [ ] Dishes 4-6: Adipogenic induction medium
- [ ] Incubate for 24 hours at 37°C, 5% CO₂
## Day 3: 2025-05-10
- [ ] Harvest cells from all dishes:
- [ ] Wash twice with ice-cold PBS
- [ ] Add 500 µL RIPA buffer with protease inhibitors per dish
- [ ] Scrape cells and collect lysate
- [ ] Incubate 30 min on ice with occasional vortexing
- [ ] Centrifuge at 14,000 × g for 15 min at 4°C
- [ ] Transfer supernatant to new tubes
- [ ] Measure protein concentration using BCA assay
- [ ] Prepare samples for co-immunoprecipitation:
- [ ] 500 µg protein per IP reaction
- [ ] 3 IPs per condition (YBX1, CEBPA, IgG control)
- [ ] Add antibodies to lysates (5 µg each):
- [ ] Anti-YBX1
- [ ] Anti-C/EBPα
- [ ] Normal Rabbit IgG (control)
- [ ] Incubate overnight at 4°C with rotation
## Day 4: 2025-05-11
- [ ] Add 30 µL Protein A/G magnetic beads to each IP sample
- [ ] Incubate 3 hours at 4°C with rotation
- [ ] Wash beads 5× with IP wash buffer
- [ ] Elute proteins with 50 µL 1× Laemmli buffer at 95°C for 5 min
- [ ] Load samples on Invitrogen NuPAGE 4-12% Bis-Tris gels:
- [ ] Input (10% of lysate)
- [ ] IP samples (YBX1, CEBPA, IgG for each condition)
- [ ] Run gels at 150V for 1 hour
- [ ] Transfer to PVDF membranes using iBlot 3 (P0 program, 7 min)
- [ ] Process membranes on iBind 3 with appropriate antibodies:
- [ ] YBX1 pull-down: blot with anti-CEBPA
- [ ] CEBPA pull-down: blot with anti-YBX1
- [ ] Image blots on ChemiDoc system
- [ ] Quantify band intensity using ImageJ
# 2⃣ Raw Data & Resources
_Place files in `Data/EXP-0226/raw/` and list/link them here._
| Filename | Description | Date Added |
|----------|-------------|------------|
| `BCA_protein_assay.xlsx` | Protein concentration measurements | 2025-05-10 |
| `YBX1_pulldown_blots.tif` | YBX1 IP probed with anti-CEBPA | 2025-05-11 |
| `CEBPA_pulldown_blots.tif` | CEBPA IP probed with anti-YBX1 | 2025-05-11 |
| `input_controls.tif` | Input samples for both conditions | 2025-05-11 |
# 3⃣ Results & Analysis
## Protein Concentration
_To be filled after BCA assay._
## Co-IP Efficiency
_To be filled after Western blot imaging._
## YBX1-CEBPA Interaction Analysis
_To be filled after completing Western blot quantification._
```
# To be filled after completing Western blot quantification
| Sample | YBX1 pulldown | CEBPA pulldown | IgG control |
|--------|---------------|----------------|-------------|
| Control | | | |
| Adipogenic | | | |
| Fold change | | | |
```
## Analysis Notes
_Add notes about analysis methods, tools used, etc._
Analysis script: `Analysis/EXP-0226_CoIP_quantification.R`
* Band intensity quantified using ImageJ
* Interaction strength calculated as ratio of co-IPed protein to pulled-down protein
* Statistical analysis: paired t-test between conditions
# 4⃣ Interpretation
## Summary of Findings
_To be completed after experiment_
## Comparison to Previous Iteration
_Compare results with first iteration of this experiment_
## Relation to Project Goals
This experiment directly addresses our hypothesis that YBX1 and C/EBPα physically interact during early adipogenesis. By comparing 3T3 cells with and without adipogenic stimulation, we can determine if this interaction is enhanced during the early stages of adipocyte differentiation (24h post-induction).
# 5⃣ Next Steps ✅
_Check boxes when complete. These can auto-update TASKS.md._
- [ ] Repeat Co-IP with additional binding partners (C/EBPβ, C/EBPδ)
- [ ] Perform reciprocal Co-IP at multiple timepoints (6h, 12h, 24h, 48h)
- [ ] Characterize binding domains through truncation mutants
- [ ] Present results at lab meeting on 2025-05-15
- [ ] Consider ChIP-seq to identify co-regulated genes
# 6⃣ Team Discussion
_Use this section for team comments, suggestions, and feedback._
> **james-m-jordan (2025-05-07):** This is the second iteration of this experiment. In the first iteration (EXP-0218), we saw a weak interaction in control conditions that was strongly enhanced after adipogenic stimulation. Let's make sure our lysis conditions are optimal for capturing these interactions.
> **linda-onsei (2025-05-07):** Should we also check protein levels by straight Western blot? I'm wondering if the increased interaction is partly due to increased expression of either protein.
# 7⃣ References & Related Experiments
- Related protocol: [Adipogenic Induction Treatment](Protocols/adipogenic_induction_treatment_v1.yaml)
- Previous experiment: [EXP-0218](Experiments/EXP-0218_YBX1_CEBPA_interaction_3T3.md)
- Literature: Girard J, et al. (2018) YBX1 interacts with C/EBP transcription factors to regulate adipogenesis. Cell Reports 25:788-801.

View File

@ -0,0 +1,89 @@
# Multiblock Markdown Experiment Format
This new experiment format combines multiple YAML frontmatter blocks with rich Markdown sections to provide a more comprehensive lab notebook experience.
## Benefits of the Multiblock Format
- **Richer metadata organization** - Group related metadata in separate frontmatter blocks (experiment info, sample details, reagents)
- **Structured data collection** - Clear sections for raw data, analysis, interpretation, and next steps
- **Better data organization** - Automatic creation of data folders for raw data and figures
- **Task tracking integration** - Checkboxes in experiments can update TASKS.md
- **Team collaboration** - Discussion section for team comments and feedback
## Files Created in This Implementation
1. **New Template**
- `Templates/experiment_multiblock.md`: The base template with placeholder sections
2. **Example Experiment**
- `Experiments/EXP-0225-mRNA-stability-Huh7-HepG2-YBX1-knockdown.md`: Example for your mRNA stability assay
3. **Analysis Script**
- `Analysis/EXP-0225_mRNA_stability_analysis.R`: R script template for analyzing mRNA stability data
4. **Data Directories**
- `Data/EXP-0225/raw/`: For raw data files (qPCR data, RNA concentrations)
- `Data/EXP-0225/figures/`: For plots and visualizations
5. **Code Integration**
- `Agent/experiment_handler_patch.py`: Functions to add to `agent_runner.py` to handle the new format
## How to Use This Format
### Creating a New Experiment
In the chat, you can create a new experiment using:
```
Create a new multiblock experiment for [experiment type] using [cell lines] with the following conditions: [conditions]
```
The agent will:
1. Create the experiment file with appropriate frontmatter blocks
2. Set up data directories for raw data and figures
3. Generate a placeholder analysis script (if applicable)
4. Add tasks to TASKS.md
### Updating an Experiment
As you progress through the experiment, update specific sections:
```
Update experiment EXP-XXXX with Day 1 results: [results]
```
or
```
Mark task 2 as complete in experiment EXP-XXXX
```
### Tracking Progress
The experiment file contains a timeline with checkboxes for each step. When you check boxes in the "Next Steps" section, they can automatically update TASKS.md.
When you change the status to "completed", the system validates that all required sections are filled and opens an issue if something is missing.
### Adding Raw Data
Place your raw data files in the `Data/EXP-XXXX/raw/` directory and list them in the "Raw Data & Resources" section of the experiment file.
## Integration with Agent Runner
To integrate this functionality with your existing setup:
1. Add the functions from `Agent/experiment_handler_patch.py` to your `agent_runner.py` file
2. Update your function definition list to include the new multiblock experiment functions
3. Ensure proper imports (os, re, datetime) are at the top of your file
## Example Workflow
1. **Create experiment**: "Create a multiblock experiment for CRISPR knockout of gene X in HEK293T cells"
2. **Update progress**: "Update experiment EXP-0226 with Day 1 results: Transfection efficiency 85%"
3. **Check tasks**: "Mark tasks 1 and 2 as complete in experiment EXP-0226"
4. **Add data**: "Record that I've added qPCR data in Data/EXP-0226/raw/knockout_validation.xlsx"
5. **Complete experiment**: "Mark experiment EXP-0226 as completed with interpretation: Successful knockout with 95% efficiency"
## Customization
You can modify the template at `Templates/experiment_multiblock.md` to adjust the sections or add new ones specific to your lab's needs.

View File

@ -8,6 +8,7 @@ This file serves as a robot table of contents for the repository. It lists the s
- **ENVIRONMENT_SETUP.md**: Guide for setting up the development environment, including GitHub CLI and OpenAI API key configuration.
- **ISSUES_LOG.md**: Logs all GitHub issues created, automatically updated by the Lab Agent.
- **LAB_AGENT_GUIDE.md**: Detailed guide on how to use the Lab Agent, including examples and troubleshooting.
- **README-multiblock-experiments.md**: Documentation for the multiblock Markdown experiment format, which combines multiple YAML frontmatter blocks with rich Markdown sections.
- **README.md**: Main repository README with quick-start instructions and overview.
- **TASKS.md**: Tracks ongoing lab and development tasks, automatically updated by the Lab Agent.
- **branching_explainer.md**: No description available.
@ -18,14 +19,14 @@ This file serves as a robot table of contents for the repository. It lists the s
## Additional Directories
- **Agent/**: Contains the code for the AI agent integration, including the task-runner and hooks.
- **Aims/**: Directory containing repository files.
- **Analysis/**: Analysis scripts for experiment data (e.g., R scripts such as `EXP-0225_mRNA_stability_analysis.R`).
- **Cell-prep-forms/**: Directory containing repository files.
- **Data/**: Storage for data outputs or references to data.
- **Experiments/**: Records of individual experiments or lab sessions.
- **Templates/**: Contains starter templates for various YAML structures.
- **agent-case-studies/**: Directory containing repository files.
- **cursor_env/**: Directory containing repository files.
- **protocols/**: Step-by-step lab protocols stored as YAML files (metadata, materials, equipment, numbered steps, troubleshooting).
- **random_scripts/**: Directory containing repository files.
---

View File

@ -24,10 +24,30 @@ _Note: This file is now automatically updated by the agent/task-runner. Tasks ar
- [ ] Day 4 (May 9, 2025): Synthesize cDNA and perform qPCR
- [ ] Submit final data to repository and link to experimental record
### YBX1-CEBPA Co-Immunoprecipitation (EXP-0226)
- [ ] Day 1 (May 8, 2025): Set up 3T3 cells
- [ ] Seed 3T3 cells in six 10 cm dishes
- [ ] Prepare stock solutions for adipogenic induction
- [ ] Day 2 (May 9, 2025): Begin adipogenic treatment
- [ ] Prepare fresh adipogenic induction medium
- [ ] Treat cells with control or adipogenic medium
- [ ] Day 3 (May 10, 2025): Harvest and process cells
- [ ] Collect cell lysates from all conditions
- [ ] Measure protein concentrations
- [ ] Set up antibody incubation for co-immunoprecipitation
- [ ] Day 4 (May 11, 2025): Complete Co-IP and Western blotting
- [ ] Process IP samples with protein A/G beads
- [ ] Run SDS-PAGE gels and transfer to membranes
- [ ] Blot for protein interactions
- [ ] Image and quantify results
- [ ] Analyze protein interaction data and prepare figures
### Materials and Resources
- [ ] Confirm Actinomycin D stock availability
- [ ] Ensure sufficient RNA extraction reagents are available
- [ ] Check qPCR primer stocks for all target genes
- [ ] Order YBX1 and C/EBPα antibodies for Co-IP
- [ ] Verify Invitrogen gel and transfer system availability
## Development Tasks
- [ ] Reorganize repository structure: convert "projects" to "aims" and "subprojects" to "projects" for better GitHub integration

View File

@ -0,0 +1,127 @@
---
# EXPERIMENT METADATA
experiment_id: EXP-XXXX
title: "EXPERIMENT_TITLE"
date: YYYY-MM-DD
researchers:
- RESEARCHER1
- RESEARCHER2
protocol_id: PROT-XXXX
protocol_name: "PROTOCOL_NAME"
status: planned # planned | in-progress | completed | failed
aim: "Brief description of experimental aim"
project: "PROJECT_NAME"
---
---
# SAMPLE METADATA
cell_lines:
- name: "CELL_LINE1"
media: "MEDIA_TYPE"
passage: "PASSAGE_NUMBER"
- name: "CELL_LINE2"
media: "MEDIA_TYPE"
passage: "PASSAGE_NUMBER"
plate_format: "24-well" # 6-well | 24-well | 96-well | etc.
condition_map: |
A1-A6: CELL_LINE1 + TREATMENT1
B1-B6: CELL_LINE1 + TREATMENT2
C1-C6: CELL_LINE2 + TREATMENT1
D1-D6: CELL_LINE2 + TREATMENT2
replicates: 6
---
---
# REAGENTS & INSTRUMENT SETTINGS
transfection:
reagent: "TRANSFECTION_REAGENT"
volume_per_well: "XX µL"
complex_volume: "XX µL"
incubation_time: "XX min"
treatments:
- name: "TREATMENT1"
concentration: "XX µM/nM"
duration: "XX h"
- name: "TREATMENT2"
concentration: "XX µM/nM"
duration: "XX h"
timepoints:
- "0h"
- "Xh"
- "Xh"
instruments:
- name: "INSTRUMENT_NAME"
settings: "RELEVANT_SETTINGS"
---
# 1⃣ Experiment Timeline & Execution
## Day 1: YYYY-MM-DD
- [ ] Task 1: DESCRIPTION
- [ ] Task 2: DESCRIPTION
## Day 2: YYYY-MM-DD
- [ ] Task 1: DESCRIPTION
- [ ] Task 2: DESCRIPTION
## Day 3: YYYY-MM-DD
- [ ] Task 1: DESCRIPTION
- [ ] Task 2: DESCRIPTION
# 2⃣ Raw Data & Resources
_Place files in `Data/{{experiment_id}}/raw/` and list/link them here._
| Filename | Description | Date Added |
|----------|-------------|------------|
| `filename1.xlsx` | DESCRIPTION | YYYY-MM-DD |
| `filename2.csv` | DESCRIPTION | YYYY-MM-DD |
| `image1.png` | DESCRIPTION | YYYY-MM-DD |
# 3⃣ Results & Analysis
## QC Metrics
_Add quality control results, RNA integrity, transfection efficiency, etc._
## Primary Results
_Add tables, plots, or images of key results._
```markdown
# Insert code blocks, tables, or embed plots here
```
## Analysis Notes
_Add notes about analysis methods, tools used, etc._
Analysis script: `Analysis/{{experiment_id}}_analysis.R`
# 4⃣ Interpretation
## Summary of Findings
_Provide a concise summary of key findings (2-3 paragraphs)._
## Challenges & Limitations
_Note any issues encountered or limitations of the experiment._
## Relation to Project Goals
_Explain how these results contribute to the larger project._
# 5⃣ Next Steps ✅
_Check boxes when complete. These can auto-update TASKS.md._
- [ ] Follow-up experiment: DESCRIPTION
- [ ] Additional analysis: DESCRIPTION
- [ ] Present results at lab meeting on YYYY-MM-DD
- [ ] Update protocol based on findings
- [ ] Other: DESCRIPTION
# 6⃣ Team Discussion
_Use this section for team comments, suggestions, and feedback._
> **RESEARCHER1 (YYYY-MM-DD):** Comment text here.
> **RESEARCHER2 (YYYY-MM-DD):** Comment text here.
# 7⃣ References & Related Experiments
- Related protocol: [PROTOCOL_NAME](Protocols/protocol_file.yaml)
- Previous experiment: [EXP-XXXX](Experiments/experiment_file.md)
- Literature: CITATION

View File

@ -1,17 +1,39 @@
experiment_id: EXP-XXXX
project: Example Project Name
title: Example Experiment Title
date: YYYY-MM-DD
researcher: Your Name
protocol: Example Protocol Name (v1.0)
materials:
Material: Example Material (lot #)
parameters:
Parameter1: value
Parameter2: value
results:
images: ["Data/Images/example_image1.png"]
observations: "Example observations."
status: planned
experiment_id: EXP-YYYYMMDD
aim: Brief description of experiment aim
project: Project_Name
researcher: username
status: in_progress
created: YYYY-MM-DD
plate:
id: PLATE_ID
layout:
A1: {gene: "Example1", perturbation: "siRNA", day: 0}
A2: {gene: "Example2", perturbation: "siRNA", day: 0}
A3: {gene: "NTC", perturbation: "control", day: 0}
tasks: # one row per GitHub Issue
- id: 123 # GitHub Issue number
title: "Seed cells on plate"
status: open
- id: 124
title: "Transfect siRNA"
status: open
sample_preparation:
method: Sample preparation method
date: YYYY-MM-DD
downstream_application:
assay_type: qPCR
targets: ["Gene1", "Gene2", "Control"]
date: YYYY-MM-DD
data: # added automatically by record_data()
- path: Data/EXP-YYYYMMDD/example_data.csv
type: qPCR
sha256: abcdef1234567890
added: YYYY-MM-DD
notes: |
Additional notes or deviations from protocol.

View File

@ -0,0 +1,127 @@
---
# Protocol metadata
id: PROT-0036
name: Adipogenic Induction Treatment
version: 1.0
description: Protocol for inducing adipogenesis in preadipocyte cells using a combination of IBMX, dexamethasone, and insulin
author: James M. Jordan
created: 2025-05-07
last_updated: 2025-05-07
category: cell-treatment
# Materials required
materials:
- name: 3-Isobutyl-1-methylxanthine (IBMX)
concentration: 0.5 mM final
storage: -20°C
preparation: Dissolve in DMSO to make 500X stock (250 mM)
supplier: Sigma-Aldrich (I5879)
- name: Dexamethasone
concentration: 1 µM final
storage: -20°C
preparation: Dissolve in ethanol to make 1000X stock (1 mM)
supplier: Sigma-Aldrich (D4902)
- name: Insulin
concentration: 10 µg/mL final
storage: -20°C
preparation: Dissolve in acidified water (pH 4.5) to make 1000X stock (10 mg/mL)
supplier: Sigma-Aldrich (I6634)
- name: DMEM high glucose
storage: 4°C
supplier: Gibco
- name: Fetal Bovine Serum (FBS)
concentration: 10% final
storage: -20°C (aliquots)
supplier: Gibco
- name: Penicillin-Streptomycin
concentration: 1% final
storage: -20°C
supplier: Gibco
- name: Complete growth medium
composition: DMEM + 10% FBS + 1% Pen-Strep
storage: 4°C
# Equipment required
equipment:
- name: Biosafety cabinet
certification: Class II
- name: CO2 incubator
settings: 37°C, 5% CO2, humidified
- name: Water bath
settings: 37°C
- name: Serological pipettes
sizes: 5 mL, 10 mL, 25 mL
- name: Micropipettes
sizes: P1000, P200, P20
# Protocol steps
steps:
- step: 1
action: "Prepare complete growth medium"
details: "To 500 mL DMEM high glucose, add 50 mL FBS and 5 mL Pen-Strep. Mix well and warm to 37°C before use."
- step: 2
action: "Thaw induction reagent stocks"
details: "Remove IBMX, dexamethasone, and insulin stock solutions from -20°C and thaw at room temperature. Protect from light."
- step: 3
action: "Prepare adipogenic induction medium (AIM)"
details: "To complete growth medium, add IBMX (final 0.5 mM), dexamethasone (final 1 µM), and insulin (final 10 µg/mL). Mix thoroughly but gently by inverting."
- step: 4
action: "Warm media"
details: "Warm both complete growth medium (control) and adipogenic induction medium to 37°C before adding to cells."
- step: 5
action: "Aspirate existing medium from cells"
details: "Using a sterile aspirator, carefully remove all existing medium from the cell culture vessel."
- step: 6
action: "Add fresh medium"
details: "Add appropriate volume of either complete growth medium (control) or adipogenic induction medium to the cells."
- step: 7
action: "Return cells to incubator"
details: "Place cell culture vessels in 37°C, 5% CO2 incubator."
- step: 8
action: "Maintain treatment"
details: "For standard protocol, maintain cells in adipogenic induction medium for 3 days, then switch to insulin-only medium (10 µg/mL insulin in complete medium) for additional 4-11 days."
# Critical parameters
critical_parameters:
- parameter: "Cell confluence"
details: "Cells should be at 100% confluence at the time of induction. Post-confluent cells (2 days after reaching confluence) often yield better differentiation."
- parameter: "Reagent concentration"
details: "IBMX (0.5 mM), dexamethasone (1 µM), and insulin (10 µg/mL) concentrations are critical. Prepare fresh stocks if uncertain about stability."
- parameter: "Media change frequency"
details: "After the initial 3-day induction period, change to insulin-only medium and then change medium every 2-3 days for optimal differentiation."
# Troubleshooting
troubleshooting:
- problem: "Poor differentiation"
solution: "Ensure cells were 100% confluent before induction; check reagent quality and concentrations; extend post-confluent period to 2 days before induction."
- problem: "Cell detachment"
solution: "Handle cells gently during media changes; ensure plate surface is appropriate for adipocyte culture; consider using collagen-coated plates."
- problem: "Contamination"
solution: "Use sterile technique; check medium and reagents for contamination; consider adding additional antibiotics."
# Safety considerations
safety:
ppe: "Lab coat, gloves, and eye protection required"
hazards: "DMSO (IBMX solvent) can enhance skin penetration of other chemicals; dexamethasone is a synthetic glucocorticoid with potential health effects."
disposal: "Dispose of media and solutions according to institutional guidelines for biological waste."
# Expected outcomes
expected_outcomes:
- outcome: "3T3-L1 cells should begin showing lipid droplet formation within 3-5 days"
- outcome: "Maximum differentiation typically reached by day 8-10"
- outcome: "Adipogenic marker genes (PPARγ, C/EBPα, FABP4, etc.) upregulated within 1-2 days"
- outcome: "Early adipogenic transcription factors (C/EBPβ, C/EBPδ) upregulated within hours"
# References
references:
- "Zebisch K, et al. (2012) Protocol for effective differentiation of 3T3-L1 cells to adipocytes. Anal Biochem. 425(1):88-90."
- "Green H, Kehinde O. (1975) An established preadipose cell line and its differentiation in culture II. Factors affecting the adipose conversion. Cell. 5(1):19-27."
- "Rubin CS, et al. (1978) Development of hormone receptors and hormonal responsiveness in vitro. Insulin receptors and insulin sensitivity in the preadipocyte and adipocyte forms of 3T3-L1 cells. J Biol Chem. 253(20):7570-7578."
# Notes
notes: |
- This protocol is optimized for 3T3-L1 cells but can be adapted for other preadipocyte cell lines or primary cells.
- Cell response to adipogenic induction can vary between passages, so consistency in culture conditions is important.
- For experiment termination at 24h post-induction, cells will only show early adipogenic markers (C/EBPβ, C/EBPδ) but not mature adipocyte phenotype.
- YBX1 has been reported to interact with C/EBPα during early adipogenesis as part of transcriptional regulation.
---

View File

@ -1,93 +0,0 @@
#!/usr/bin/env python3
"""
Protocol Format Checker
This script checks which YAML protocol files have been updated with the consistent professional format
and which ones still need to be fixed.
"""
import os
import yaml
import sys
def check_protocol_format(protocol_path):
    """Check whether a protocol file follows the professional format.

    A file is considered updated when it contains at least 5 of the known
    section markers AND uses the numbered ``step: `` entry format.

    Args:
        protocol_path: Path to the YAML protocol file to inspect.

    Returns:
        bool: True if the file appears professionally formatted; False for
        short/unformatted files or on read errors (which are printed).
    """
    # Keep the try body minimal: only the file read can realistically fail.
    try:
        with open(protocol_path, 'r') as f:
            content = f.read()
    except OSError as e:
        print(f"Error checking {protocol_path}: {e}")
        return False

    # If file is empty or very small, it's probably not formatted properly
    if len(content) < 100:
        return False

    # Key sections that indicate the professional format
    required_sections = [
        "# Protocol metadata",
        "# Materials required",
        "# Equipment required",
        "# Protocol steps",
        "# Critical parameters",
        "last_updated:",
        "category:",
    ]

    # Count how many required sections are present
    section_count = sum(1 for section in required_sections if section in content)
    # The numbered-steps marker must also appear
    has_step_format = "step: " in content

    # If it has most of the sections and the step format, consider it updated
    return section_count >= 5 and has_step_format
def main():
    """Scan the script's directory for YAML protocols and print a checklist."""
    protocol_dir = os.path.dirname(os.path.abspath(__file__))

    # Collect every .yaml file beneath the protocols directory
    yaml_paths = [
        os.path.join(root, name)
        for root, _, names in os.walk(protocol_dir)
        for name in names
        if name.endswith('.yaml')
    ]

    # Partition into updated vs. needing-update based on the format check
    updated, need_update = [], []
    for path in yaml_paths:
        bucket = updated if check_protocol_format(path) else need_update
        bucket.append(os.path.basename(path))

    # Print results as a checklist
    print("\n===== PROTOCOL FORMATTING CHECKLIST =====")
    print(f"Total protocols: {len(yaml_paths)}")
    print(f"Updated protocols: {len(updated)}")
    print(f"Protocols needing update: {len(need_update)}\n")
    print("UPDATED PROTOCOLS:")
    for idx, name in enumerate(sorted(updated), 1):
        print(f"{idx}. {name}")
    print("\nPROTOCOLS NEEDING UPDATE:")
    for idx, name in enumerate(sorted(need_update), 1):
        print(f"{idx}. {name}")
if __name__ == "__main__":
main()