This guide helps you diagnose and fix common issues when using the ev SDK with daft (our multimodal query engine).

Topics covered:

  • Installation Issues
      • Package Installation Fails
  • Authentication Issues
      • Configuration Problems
  • Job Execution Issues
      • Job Submission Problems
      • Runtime Errors
  • daft-Specific Issues
      • Data Processing Errors
      • Performance Issues
  • Environment Issues
      • Dependency Management
  • Debugging Techniques
  • Getting Help

Debugging Techniques

Comprehensive Logging
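
When a job fails or stalls remotely, stage-by-stage logging usually localizes the problem faster than rerunning blind. The example below instruments a daft job with elapsed times, schema and row-count checks, and (if psutil is installed) memory usage: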

import logging
import time
import traceback
from datetime import datetime

import daft

@job.main
def debug_daft_job(input_path: str):
    # Setup detailed logging
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    
    logger.info(f"Job started at {datetime.now()}")
    logger.info(f"Input path: {input_path}")
    
    start_time = time.time()
    
    try:
        # Log data loading
        logger.info("Loading data with daft...")
        load_start = time.time()
        df = daft.read_parquet(input_path)
        
        # Log schema information (available without executing the query)
        logger.info(f"Schema: {df.schema()}")
        logger.info(f"Scan prepared in {time.time() - load_start:.2f}s (lazy; no rows read yet)")
        
        # Log row count (triggers execution)
        count_start = time.time()
        row_count = df.count_rows()
        logger.info(f"Row count: {row_count} (computed in {time.time() - count_start:.2f}s)")
        
        # Log processing steps
        logger.info("Applying filters...")
        filter_start = time.time()
        df = df.where(df["status"] == "active")
        active_count = df.count_rows()
        logger.info(f"Active rows: {active_count} (filtered in {time.time() - filter_start:.2f}s)")
        
        # Log memory usage if available
        try:
            import psutil
            memory_mb = psutil.Process().memory_info().rss / 1024 / 1024
            logger.info(f"Memory usage: {memory_mb:.1f} MB")
        except ImportError:
            pass
        
        total_time = time.time() - start_time
        logger.info(f"Job completed successfully in {total_time:.2f}s")
        
        return {
            "success": True,
            "total_rows": row_count,
            "active_rows": active_count,
            "processing_time": total_time
        }
        
    except Exception as e:
        logger.error(f"Job failed after {time.time() - start_time:.2f}s: {e}")
        logger.error(f"Error type: {type(e).__name__}")
        logger.error(f"Traceback: {traceback.format_exc()}")
        raise
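
When timing alone doesn't show where a query misbehaves, inspecting the query plan can. Below is a minimal sketch assuming daft's DataFrame.explain API (the show_all flag, which also prints the optimized plan, is taken from daft's documented signature; the input path is illustrative):

import daft

df = daft.read_parquet("data/sample.parquet")  # illustrative path
df = df.where(df["status"] == "active")

# Print the plan daft would run; explain() itself executes nothing
df.explain(show_all=True)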

Local Testing Strategies
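
Before submitting a job, run the same daft operations against a small in-memory dataset; daft.from_pydict makes this cheap and deterministic: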

# test_daft_locally.py
def test_daft_pipeline():
    """Test daft operations with small local data."""
    import daft
    
    # Create test data
    test_data = {
        "id": [1, 2, 3, 4, 5],
        "status": ["active", "inactive", "active", "pending", "active"],
        "value": [10.5, 20.0, 30.5, 40.0, 50.5],
        "timestamp": ["2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04", "2024-01-05"]
    }
    
    df = daft.from_pydict(test_data)
    
    # Test the same operations as in your job
    result_df = df.where(df["status"] == "active")
    result_df = result_df.with_column("processed", daft.lit(True))
    
    # Collect and inspect
    result = result_df.collect()
    print(f"Test result: {result.to_pydict()}")
    
    # Verify expected behavior
    assert len(result) == 3  # Should have 3 active records
    assert all(result.to_pydict()["processed"])  # All should be marked processed
    
    print("Local test passed!")

if __name__ == "__main__":
    test_daft_pipeline()
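
Because from_pydict bypasses the file-reading path, it can also be worth round-tripping the same test data through Parquet so the test exercises daft.read_parquet. A sketch, assuming write_parquet writes generated file names into the target directory (hence the glob on read):

# test_parquet_roundtrip.py
import os
import tempfile

import daft

def test_parquet_roundtrip():
    """Write test data to Parquet and read it back through daft."""
    df = daft.from_pydict({
        "id": [1, 2, 3],
        "status": ["active", "inactive", "active"],
    })

    with tempfile.TemporaryDirectory() as tmp:
        df.write_parquet(tmp)  # file names inside tmp are generated
        loaded = daft.read_parquet(os.path.join(tmp, "*.parquet"))
        assert loaded.count_rows() == 3

if __name__ == "__main__":
    test_parquet_roundtrip()
    print("Round-trip test passed!")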

Getting Help

Diagnostic Information

When reporting issues, include:
# ev SDK version
pip show ev-sdk

# Python version
python --version

# Operating system
uname -a  # Linux/macOS
# systeminfo | findstr /B /C:"OS Name" /C:"OS Version"  # Windows

# daft version (included with ev-sdk)
python -c "import daft; print(daft.__version__)"

# Environment packages
pip freeze
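
The same details can be collected from Python in one step. A convenience sketch (the ev-sdk distribution name is assumed from the pip show command above):

# collect_diagnostics.py
import platform
import sys
from importlib.metadata import PackageNotFoundError, version

import daft

print(f"Python: {sys.version.split()[0]}")
print(f"OS:     {platform.platform()}")
print(f"daft:   {daft.__version__}")
try:
    print(f"ev-sdk: {version('ev-sdk')}")  # distribution name is an assumption
except PackageNotFoundError:
    print("ev-sdk: not installed")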

What to Include in a Report

When seeking help, provide:
  1. Complete error message including stack trace
  2. Minimal reproducible example with sample data (a template sketch follows this list)
  3. Environment configuration (packages, versions)
  4. Job parameters and input data characteristics
  5. Expected vs actual behavior
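
For item 2, the template below is one way to structure a report: inline data small enough to paste, the shortest operation chain that still fails, and a collect() so the failure surfaces. Column names and values are placeholders:

# repro.py -- minimal reproducible example template
import daft

# Smallest inline dataset that still triggers the problem
df = daft.from_pydict({"id": [1, 2, 3], "value": [10.0, None, 30.0]})

# Shortest chain of operations that reproduces the error
df = df.where(df["value"] > 15.0)

# collect() forces execution, so the failure (if any) surfaces here
print(df.collect().to_pydict())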

Community Resources

  • Documentation: https://docs.daft.ai
  • GitHub Issues: Report bugs and feature requests
  • Community Forum: Ask questions and share solutions
  • Example Gallery: Browse working examples and patterns

For complex multimodal processing issues or performance optimization questions, consider sharing your specific use case and data characteristics to get more targeted assistance.