add honeycomb connection (#56)

Arnav Agrawal authored on 2025-03-23 17:50:18 -04:00 (committed by GitHub)
parent 8712bb49e0
commit dc24a918a1
5 changed files with 490 additions and 33 deletions


@@ -90,6 +90,19 @@ class Settings(BaseSettings):
    # Colpali configuration
    ENABLE_COLPALI: bool

    # Telemetry configuration
    TELEMETRY_ENABLED: bool = True
    HONEYCOMB_ENABLED: bool = True
    HONEYCOMB_ENDPOINT: str = "https://api.honeycomb.io"
    HONEYCOMB_PROXY_ENDPOINT: str = "https://otel-proxy.onrender.com/"
    SERVICE_NAME: str = "databridge-core"
    OTLP_TIMEOUT: int = 10
    OTLP_MAX_RETRIES: int = 3
    OTLP_RETRY_DELAY: int = 1
    OTLP_MAX_EXPORT_BATCH_SIZE: int = 512
    OTLP_SCHEDULE_DELAY_MILLIS: int = 5000
    OTLP_MAX_QUEUE_SIZE: int = 2048
@lru_cache()
@@ -286,6 +299,22 @@ def get_settings() -> Settings:
            "GRAPH_MODEL": config["graph"]["model_name"],
        }

    # load telemetry config
    telemetry_config = {}
    if "telemetry" in config:
        telemetry_config = {
            "TELEMETRY_ENABLED": config["telemetry"].get("enabled", True),
            "HONEYCOMB_ENABLED": config["telemetry"].get("honeycomb_enabled", True),
            "HONEYCOMB_ENDPOINT": config["telemetry"].get("honeycomb_endpoint", "https://api.honeycomb.io"),
            "SERVICE_NAME": config["telemetry"].get("service_name", "databridge-core"),
            "OTLP_TIMEOUT": config["telemetry"].get("otlp_timeout", 10),
            "OTLP_MAX_RETRIES": config["telemetry"].get("otlp_max_retries", 3),
            "OTLP_RETRY_DELAY": config["telemetry"].get("otlp_retry_delay", 1),
            "OTLP_MAX_EXPORT_BATCH_SIZE": config["telemetry"].get("otlp_max_export_batch_size", 512),
            "OTLP_SCHEDULE_DELAY_MILLIS": config["telemetry"].get("otlp_schedule_delay_millis", 5000),
            "OTLP_MAX_QUEUE_SIZE": config["telemetry"].get("otlp_max_queue_size", 2048),
        }

    settings_dict = dict(ChainMap(
        api_config,
        auth_config,
@@ -299,6 +328,7 @@ def get_settings() -> Settings:
        rules_config,
        databridge_config,
        graph_config,
        telemetry_config,
    ))

    return Settings(**settings_dict)
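
Once merged through `ChainMap`, the new keys surface as attributes on the cached `Settings` object. A quick sanity check, as a sketch only: it assumes you run it from a checkout with a valid config file in place.

```python
from core.config import get_settings

settings = get_settings()

# Fields introduced by this commit; they fall back to the class defaults
# above when the [telemetry] section is missing from the TOML config.
print(settings.TELEMETRY_ENABLED)
print(settings.HONEYCOMB_PROXY_ENDPOINT)
print(settings.OTLP_MAX_RETRIES)
```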


@@ -8,6 +8,11 @@ from contextlib import asynccontextmanager
import os
import json
from pathlib import Path
import uuid
import hashlib
import logging

from core.config import get_settings
from opentelemetry import trace, metrics
from opentelemetry.sdk.trace import TracerProvider
@@ -21,8 +26,68 @@ from opentelemetry.sdk.metrics.export import (
    AggregationTemporality,
    MetricsData,
)
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import OTLPMetricExporter
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
import requests
from urllib3.exceptions import ProtocolError, ReadTimeoutError

# Get settings from config
settings = get_settings()

# Telemetry configuration - use settings directly from TOML
TELEMETRY_ENABLED = settings.TELEMETRY_ENABLED
HONEYCOMB_ENABLED = settings.HONEYCOMB_ENABLED

# Honeycomb configuration - using proxy to avoid exposing API key in code
# Default to localhost:8080 for the proxy, but allow override from settings
HONEYCOMB_PROXY_ENDPOINT = getattr(settings, "HONEYCOMB_PROXY_ENDPOINT", "http://localhost:8080")
SERVICE_NAME = settings.SERVICE_NAME

# Headers for OTLP - no API key needed as the proxy will add it
OTLP_HEADERS = {
    "Content-Type": "application/x-protobuf"
}

# Configure timeouts and retries directly from TOML config
OTLP_TIMEOUT = settings.OTLP_TIMEOUT
OTLP_MAX_RETRIES = settings.OTLP_MAX_RETRIES
OTLP_RETRY_DELAY = settings.OTLP_RETRY_DELAY
OTLP_MAX_EXPORT_BATCH_SIZE = settings.OTLP_MAX_EXPORT_BATCH_SIZE
OTLP_SCHEDULE_DELAY_MILLIS = settings.OTLP_SCHEDULE_DELAY_MILLIS
OTLP_MAX_QUEUE_SIZE = settings.OTLP_MAX_QUEUE_SIZE

# OTLP endpoints - using our proxy instead of direct Honeycomb connection
OTLP_TRACES_ENDPOINT = f"{HONEYCOMB_PROXY_ENDPOINT}/v1/traces"
OTLP_METRICS_ENDPOINT = f"{HONEYCOMB_PROXY_ENDPOINT}/v1/metrics"

# Enable debug logging for OpenTelemetry
os.environ["OTEL_PYTHON_LOGGING_LEVEL"] = "INFO"  # Changed from DEBUG to reduce verbosity

# Add export protocol setting if not already set
if not os.getenv("OTEL_EXPORTER_OTLP_PROTOCOL"):
    os.environ["OTEL_EXPORTER_OTLP_PROTOCOL"] = "http/protobuf"

def get_installation_id() -> str:
    """Generate or retrieve a unique anonymous installation ID."""
    id_file = Path.home() / ".databridge" / "installation_id"
    id_file.parent.mkdir(parents=True, exist_ok=True)

    if id_file.exists():
        return id_file.read_text().strip()

    # Generate a new installation ID
    # We hash the machine-id (if available) or a random UUID
    machine_id_file = Path("/etc/machine-id")
    if machine_id_file.exists():
        machine_id = machine_id_file.read_text().strip()
    else:
        machine_id = str(uuid.uuid4())

    # Hash the machine ID to make it anonymous
    installation_id = hashlib.sha256(machine_id.encode()).hexdigest()[:32]

    # Save it for future use
    id_file.write_text(installation_id)
    return installation_id
class FileSpanExporter:
@@ -139,6 +204,127 @@ class FileMetricExporter(MetricExporter):
        return {}
class RetryingOTLPMetricExporter(MetricExporter):
    """A wrapper around OTLPMetricExporter that adds better retry logic."""

    def __init__(self, endpoint, headers=None, timeout=10):
        self.exporter = OTLPMetricExporter(
            endpoint=endpoint,
            headers=headers,
            timeout=timeout
        )
        self.max_retries = OTLP_MAX_RETRIES
        self.retry_delay = OTLP_RETRY_DELAY
        self.logger = logging.getLogger(__name__)
        super().__init__()

    def export(self, metrics_data, **kwargs):
        """Export metrics with retry logic for handling connection issues."""
        retries = 0
        last_exception = None

        while retries <= self.max_retries:
            try:
                return self.exporter.export(metrics_data, **kwargs)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout,
                    ProtocolError,
                    ReadTimeoutError) as e:
                last_exception = e
                retries += 1

                if retries <= self.max_retries:
                    # Use exponential backoff
                    delay = self.retry_delay * (2 ** (retries - 1))
                    self.logger.warning(
                        f"Honeycomb export attempt {retries} failed: {str(e)}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    self.logger.error(
                        f"Failed to export to Honeycomb after {retries} attempts: {str(e)}"
                    )
            except Exception as e:
                # For non-connection errors, don't retry
                self.logger.error(f"Unexpected error exporting to Honeycomb: {str(e)}")
                return False

        # If we get here, all retries failed
        return False

    def shutdown(self, timeout_millis=30000, **kwargs):
        """Shutdown the exporter."""
        return self.exporter.shutdown(timeout_millis, **kwargs)

    def force_flush(self, timeout_millis=10000):
        """Force flush the exporter."""
        return self.exporter.force_flush(timeout_millis)

    def _preferred_temporality(self):
        """Returns the preferred temporality."""
        return self.exporter._preferred_temporality()


class RetryingOTLPSpanExporter:
    """A wrapper around OTLPSpanExporter that adds better retry logic."""

    def __init__(self, endpoint, headers=None, timeout=10):
        self.exporter = OTLPSpanExporter(
            endpoint=endpoint,
            headers=headers,
            timeout=timeout
        )
        self.max_retries = OTLP_MAX_RETRIES
        self.retry_delay = OTLP_RETRY_DELAY
        self.logger = logging.getLogger(__name__)

    def export(self, spans):
        """Export spans with retry logic for handling connection issues."""
        retries = 0

        while retries <= self.max_retries:
            try:
                return self.exporter.export(spans)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout,
                    ProtocolError,
                    ReadTimeoutError) as e:
                retries += 1

                if retries <= self.max_retries:
                    # Use exponential backoff
                    delay = self.retry_delay * (2 ** (retries - 1))
                    self.logger.warning(
                        f"Honeycomb trace export attempt {retries} failed: {str(e)}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    self.logger.error(
                        f"Failed to export traces to Honeycomb after {retries} attempts: {str(e)}"
                    )
            except Exception as e:
                # For non-connection errors, don't retry
                self.logger.error(f"Unexpected error exporting traces to Honeycomb: {str(e)}")
                return False

        # If we get here, all retries failed
        return False

    def shutdown(self):
        """Shutdown the exporter."""
        return self.exporter.shutdown()

    def force_flush(self):
        """Force flush the exporter."""
        try:
            return self.exporter.force_flush()
        except Exception as e:
            self.logger.error(f"Error during trace force_flush: {str(e)}")
            return False

@dataclass
class UsageRecord:
    timestamp: datetime
@@ -163,40 +349,87 @@ class TelemetryService:
        return cls._instance

    def _initialize(self):
        if not TELEMETRY_ENABLED:
            return

        self._usage_records: List[UsageRecord] = []
        self._user_totals = defaultdict(lambda: defaultdict(int))
        self._lock = threading.Lock()
        self._installation_id = get_installation_id()

        # Initialize OpenTelemetry
        resource = Resource.create({"service.name": "databridge-core"})
        # Initialize OpenTelemetry with more detailed resource attributes
        resource = Resource.create({
            "service.name": SERVICE_NAME,
            "service.version": os.getenv("DATABRIDGE_VERSION", "unknown"),
            "installation.id": self._installation_id,
            "environment": os.getenv("ENVIRONMENT", "production"),
            "telemetry.sdk.name": "opentelemetry",
            "telemetry.sdk.language": "python",
            "telemetry.sdk.version": "1.0.0"
        })

        # Create logs directory
        # Initialize tracing with both file and OTLP exporters
        tracer_provider = TracerProvider(resource=resource)

        # Always use both exporters
        log_dir = Path("logs/telemetry")
        log_dir.mkdir(parents=True, exist_ok=True)

        # Add file exporter for local logging
        file_span_processor = BatchSpanProcessor(FileSpanExporter(str(log_dir)))
        tracer_provider.add_span_processor(file_span_processor)

        # Add Honeycomb OTLP exporter with retry logic
        if HONEYCOMB_ENABLED:
            # Create BatchSpanProcessor with improved configuration
            otlp_span_processor = BatchSpanProcessor(
                RetryingOTLPSpanExporter(
                    endpoint=OTLP_TRACES_ENDPOINT,
                    headers=OTLP_HEADERS,
                    timeout=OTLP_TIMEOUT,
                ),
                # Configure batch processing settings
                max_queue_size=OTLP_MAX_QUEUE_SIZE,
                max_export_batch_size=OTLP_MAX_EXPORT_BATCH_SIZE,
                schedule_delay_millis=OTLP_SCHEDULE_DELAY_MILLIS,
            )
            tracer_provider.add_span_processor(otlp_span_processor)

        # Initialize tracing
        tracer_provider = TracerProvider(resource=resource)
        # Use file exporter for local development
        if os.getenv("ENVIRONMENT", "development") == "development":
            span_processor = BatchSpanProcessor(FileSpanExporter(str(log_dir)))
        else:
            span_processor = BatchSpanProcessor(OTLPSpanExporter())
        tracer_provider.add_span_processor(span_processor)

        trace.set_tracer_provider(tracer_provider)
        self.tracer = trace.get_tracer(__name__)

        # Initialize metrics
        if os.getenv("ENVIRONMENT", "development") == "development":
            metric_reader = PeriodicExportingMetricReader(
        # Initialize metrics with both exporters
        metric_readers = [
            # Local file metrics reader
            PeriodicExportingMetricReader(
                FileMetricExporter(str(log_dir)),
                export_interval_millis=60000,  # Export every minute
            )
        else:
            metric_reader = PeriodicExportingMetricReader(OTLPMetricExporter())
            ),
        ]

        meter_provider = MeterProvider(resource=resource, metric_readers=[metric_reader])
        # Add Honeycomb metrics reader if API key is available
        if HONEYCOMB_ENABLED:
            try:
                # Configure the OTLP metric exporter with improved error handling
                otlp_metric_exporter = RetryingOTLPMetricExporter(
                    endpoint=OTLP_METRICS_ENDPOINT,
                    headers=OTLP_HEADERS,
                    timeout=OTLP_TIMEOUT,
                )

                # Configure the metrics reader with improved settings
                metric_readers.append(
                    PeriodicExportingMetricReader(
                        otlp_metric_exporter,
                        export_interval_millis=OTLP_SCHEDULE_DELAY_MILLIS,
                        export_timeout_millis=OTLP_TIMEOUT * 1000,
                    )
                )
                print(f"Successfully configured Honeycomb metrics exporter to {OTLP_METRICS_ENDPOINT}")
            except Exception as e:
                print(f"Failed to configure Honeycomb metrics exporter: {str(e)}")

        meter_provider = MeterProvider(resource=resource, metric_readers=metric_readers)
        metrics.set_meter_provider(meter_provider)
        self.meter = metrics.get_meter(__name__)
@@ -224,18 +457,34 @@
        metadata: Optional[Dict[str, Any]] = None,
    ):
        """
        Context manager for tracking operations with both usage metrics and OpenTelemetry
        Context manager for tracking operations with both usage metrics and OpenTelemetry.
        The user_id is hashed to ensure anonymity.
        """
        if not TELEMETRY_ENABLED:
            yield None
            return

        start_time = time.time()
        status = "success"
        current_span = trace.get_current_span()

        # Hash the user ID for anonymity
        hashed_user_id = hashlib.sha256(user_id.encode()).hexdigest()[:16]

        try:
            # Add operation attributes to the current span
            current_span.set_attribute("operation.type", operation_type)
            current_span.set_attribute("user.id", user_id)
            current_span.set_attribute("user.id", hashed_user_id)
            if metadata:
                for key, value in metadata.items():
                # Create a copy of metadata to avoid modifying the original
                metadata_copy = metadata.copy()
                # Remove the nested 'metadata' field completely if it exists
                if 'metadata' in metadata_copy:
                    del metadata_copy['metadata']
                # Set attributes for all remaining metadata fields
                for key, value in metadata_copy.items():
                    current_span.set_attribute(f"metadata.{key}", str(value))

            yield current_span
@@ -249,30 +498,47 @@
            duration = (time.time() - start_time) * 1000  # Convert to milliseconds

            # Record metrics
            self.operation_counter.add(1, {"operation": operation_type, "status": status})
            attributes = {
                "operation": operation_type,
                "status": status,
                "installation_id": self._installation_id
            }
            self.operation_counter.add(1, attributes)
            if tokens_used > 0:
                self.token_counter.add(tokens_used, {"operation": operation_type})
            self.operation_duration.record(duration, {"operation": operation_type})
                self.token_counter.add(tokens_used, attributes)
            self.operation_duration.record(duration, attributes)

            # Record usage
            # Create a sanitized copy of metadata for the usage record
            sanitized_metadata = None
            if metadata:
                sanitized_metadata = metadata.copy()
                # Remove the nested 'metadata' field completely if it exists
                if 'metadata' in sanitized_metadata:
                    del sanitized_metadata['metadata']

            record = UsageRecord(
                timestamp=datetime.now(),
                operation_type=operation_type,
                tokens_used=tokens_used,
                user_id=user_id,
                user_id=hashed_user_id,
                duration_ms=duration,
                status=status,
                metadata=metadata,
                metadata=sanitized_metadata,
            )
            with self._lock:
                self._usage_records.append(record)
                self._user_totals[user_id][operation_type] += tokens_used
                self._user_totals[hashed_user_id][operation_type] += tokens_used

    def get_user_usage(self, user_id: str) -> Dict[str, int]:
        """Get usage statistics for a user."""
        if not TELEMETRY_ENABLED:
            return {}

        hashed_user_id = hashlib.sha256(user_id.encode()).hexdigest()[:16]
        with self._lock:
            return dict(self._user_totals[user_id])
            return dict(self._user_totals[hashed_user_id])

    def get_recent_usage(
        self,
@@ -282,12 +548,16 @@
        status: Optional[str] = None,
    ) -> List[UsageRecord]:
        """Get recent usage records with optional filtering."""
        if not TELEMETRY_ENABLED:
            return []

        with self._lock:
            records = self._usage_records.copy()

        # Apply filters
        if user_id:
            records = [r for r in records if r.user_id == user_id]
            hashed_user_id = hashlib.sha256(user_id.encode()).hexdigest()[:16]
            records = [r for r in records if r.user_id == hashed_user_id]
        if operation_type:
            records = [r for r in records if r.operation_type == operation_type]
        if since:

@@ -0,0 +1,73 @@
#!/usr/bin/env python
"""
Test script to verify that telemetry data is being properly sent through the proxy.
This script will generate a test span and metric and send it to Honeycomb via the proxy.
"""
import time
import logging
import uuid
import asyncio
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger("telemetry-test")

# Import the telemetry service
from core.services.telemetry import TelemetryService
from core.config import get_settings


async def run_test():
    """Run a telemetry test to verify proxy functionality."""
    settings = get_settings()

    # Log the current configuration
    logger.info(f"Telemetry enabled: {settings.TELEMETRY_ENABLED}")
    logger.info(f"Honeycomb enabled: {settings.HONEYCOMB_ENABLED}")
    logger.info(f"Honeycomb proxy endpoint: {settings.HONEYCOMB_PROXY_ENDPOINT}")

    # Get the telemetry service
    telemetry_service = TelemetryService()

    # Generate a unique user ID for testing
    test_user_id = f"test-user-{uuid.uuid4()}"

    # Track a test operation
    logger.info(f"Tracking test operation for user {test_user_id}")

    # Use the telemetry service to track an operation (with async context manager)
    async with telemetry_service.track_operation(
        operation_type="test_proxy",
        user_id=test_user_id,
        tokens_used=100,
        metadata={
            "test": True,
            "timestamp": datetime.now().isoformat(),
            "proxy_test": "Honeycomb proxy test"
        }
    ) as span:
        # Simulate some work
        logger.info("Performing test operation...")
        await asyncio.sleep(2)

        # Add some attributes to the span
        span.set_attribute("test.proxy", True)
        span.set_attribute("test.timestamp", time.time())

        # Log a message
        logger.info("Test operation completed successfully")

    # Wait a moment for the telemetry data to be sent
    logger.info("Waiting for telemetry data to be sent...")
    await asyncio.sleep(5)

    logger.info("Test completed. Check Honeycomb for the telemetry data.")
    logger.info(f"Look for operation_type='test_proxy' and user_id='{test_user_id}'")


def main():
    """Run the async test function."""
    asyncio.run(run_test())


if __name__ == "__main__":
    main()


@@ -108,3 +108,16 @@ model_name = "llama3.2"
# [graph]
# provider = "openai"
# model_name = "gpt-4o-mini"

[telemetry]
enabled = true
honeycomb_enabled = true
honeycomb_endpoint = "https://api.honeycomb.io"
honeycomb_proxy_endpoint = "https://otel-proxy.onrender.com"
service_name = "databridge-core"
otlp_timeout = 10
otlp_max_retries = 3
otlp_retry_delay = 1
otlp_max_export_batch_size = 512
otlp_schedule_delay_millis = 5000
otlp_max_queue_size = 2048
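
Because the exporter code simply appends `/v1/traces` and `/v1/metrics` to `honeycomb_proxy_endpoint`, the same key can in principle point at any OTLP/HTTP-compatible receiver. For example, a locally running OpenTelemetry Collector (a sketch only, not something this commit sets up):

```toml
[telemetry]
honeycomb_proxy_endpoint = "http://localhost:4318"  # default OTLP/HTTP port of a local OpenTelemetry Collector
```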

docs/telemetry.md (new file, 71 lines)

@@ -0,0 +1,71 @@
# DataBridge Telemetry
DataBridge includes an anonymous telemetry system to help us understand how the library is being used and to improve its functionality. We take privacy very seriously and ensure that no personally identifiable information (PII) is ever collected.
## What We Collect
The following anonymous data is collected:
- Installation ID (an anonymous identifier derived by hashing the machine ID, or a random UUID when no machine ID is available)
- Operation types (e.g., document ingestion, queries, retrievals)
- Operation durations
- Token usage statistics
- Error rates and types
- Basic metadata about operations (excluding any PII)
We explicitly DO NOT collect:
- Raw user identifiers (all user IDs are hashed before they are recorded)
- File contents or queries
- API keys or credentials
- Personal information
- IP addresses or location data
- Any metadata fields containing sensitive information
## How to Opt Out
Telemetry is enabled by default but can be disabled by setting the environment variable:
```bash
export DATABRIDGE_TELEMETRY_ENABLED=0
```
Or in your Python code:
```python
import os
os.environ["DATABRIDGE_TELEMETRY_ENABLED"] = "0"
```
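Alternatively, the `[telemetry]` section of the TOML configuration added in this commit exposes the same switches; a sketch of a config-file opt-out:
```toml
[telemetry]
enabled = false             # disables telemetry collection entirely
# honeycomb_enabled = false # or keep local file telemetry but skip the Honeycomb export
```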
## Data Storage and Retention
All telemetry data is:
- Stored securely
- Automatically anonymized before transmission
- Used only for improving DataBridge
- Never shared with third parties
- Retained for a maximum of 90 days
## Technical Details
The telemetry system uses OpenTelemetry to collect metrics and traces. In development mode, data is stored locally in `logs/telemetry/`. In production, data is sent to our secure collector endpoint.
You can inspect the telemetry data being collected by looking at the local log files in development mode:
- `logs/telemetry/traces.log`
- `logs/telemetry/metrics.log`
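For example, to watch both files as they are written during local development:
```bash
tail -f logs/telemetry/traces.log logs/telemetry/metrics.log
```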
## Why We Collect Telemetry
This data helps us:
1. Understand how DataBridge is used in real-world scenarios
2. Identify performance bottlenecks
3. Prioritize features and improvements
4. Fix bugs faster
5. Make data-driven decisions about the project's direction
## Questions or Concerns
If you have any questions or concerns about our telemetry collection, please:
1. Open an issue on our GitHub repository
2. Email us at privacy@databridge.dev
3. Review our telemetry implementation in `core/services/telemetry.py`