#!/usr/bin/env python3
"""
Direct LLM Intelligence Test - No Slack, No Dispatcher overhead.

This tests the RAW LLM+Tools capability without any keyword matching.
"""

import os
import sys
from datetime import datetime
from typing import Any, Dict, List, Optional

sys.path.append(os.getcwd())

from dotenv import load_dotenv
from slack_bot.llm.gemini import GeminiLLM
from slack_bot.tools.registry import TOOLS_SCHEMA, TOOL_FUNCTIONS
from health.utils.logging_config import setup_logger

# Module-level setup: shared logger plus environment loading.
logger = setup_logger(__name__)
load_dotenv()  # pull GEMINI_* settings from a local .env file, if present


class LLMTest:
    """A single LLM tool-calling test case.

    Attributes:
        name: Short identifier used in progress/summary output.
        query: The user message sent to the LLM.
        expected_tools: Tool names the LLM is expected to call for this query.
        context: Prior conversation turns as role/content dicts; defaults to
            an empty history when not provided.
    """

    def __init__(self, name: str, query: str, expected_tools: List[str],
                 context: Optional[List[Dict]] = None):
        self.name = name
        self.query = query
        self.expected_tools = expected_tools
        # `or []` gives each instance its own fresh list, avoiding the
        # shared-mutable-default pitfall.
        self.context = context or []


# Core test cases: each pairs a realistic (Chinese) user query with the
# tool(s) the LLM is expected to invoke, so only raw LLM reasoning — not
# keyword matching — determines success.
TESTS = [
    # Vague "how is today" query → daily stats tool.
    LLMTest(
        name="daily_summary",
        query="今天的健康数据怎么样？",
        expected_tools=["get_daily_detailed_stats"]
    ),

    # Relative-date ("last night") sleep question → same daily stats tool.
    LLMTest(
        name="yesterday_sleep",
        query="昨晚睡眠怎么样",
        expected_tools=["get_daily_detailed_stats"]
    ),

    # Multi-day trend question → metric-history tool.
    LLMTest(
        name="hrv_trend",
        query="过去60天的hrv变化",
        expected_tools=["get_metric_history"]
    ),

    # Free-form meal description → diet-logging tool.
    LLMTest(
        name="food_simple",
        query="晚上吃了白切鸡、花菜、红烧肉和猪血",
        expected_tools=["log_diet"]
    ),

    # Bare confirmation only makes sense with prior turns; tests whether the
    # LLM uses conversation context to pick the pending diet-log action.
    LLMTest(
        name="confirmation",
        query="好的，可以记录",
        expected_tools=["log_diet"],
        context=[
            {"role": "user", "content": "中午吃了两个卤蛋和两勺蛋白粉"},
            {"role": "assistant", "content": "需要我为您记录午餐吗？描述:两个卤蛋和两勺蛋白粉"}
        ]
    ),

    # Explicit sync request → sync tool.
    LLMTest(
        name="sync",
        query="同步一下garmin数据",
        expected_tools=["sync_garmin"]
    ),

    # Workout analysis request → activity-history tool.
    LLMTest(
        name="activity_analysis",
        query="今早椭圆机运动请深入分析",
        expected_tools=["get_activity_history"]
    ),

    # Cause-and-effect question → causal-driver analysis tool.
    LLMTest(
        name="causal_analysis",
        query="喝酒对我的睡眠有什么影响？",
        expected_tools=["analyze_driver"]
    ),

    # Explicit search request → web-search tool.
    LLMTest(
        name="web_search",
        query="搜索一下最新的NAD+研究",
        expected_tools=["search_web"]
    ),
]


def test_llm_intelligence(model_name: str = None):
    """Run every case in TESTS against the raw LLM + tool schema.

    Measures whether the LLM picks the expected tool(s) for each query
    without any keyword-matching assistance, printing per-test progress,
    a summary table, and an overall verdict.

    Args:
        model_name: Optional model override. When given, the GEMINI_MODEL
            environment variable is set for the duration of the run and
            restored afterwards.

    Returns:
        List of per-test result dicts with keys: "test", "expected",
        "called", "response", "success" (plus "error" when the call raised).
    """
    print("\n" + "="*80)
    print("🧠 RAW LLM Intelligence Test (No Keyword Matching)")
    print("="*80)
    print(f"Model: {model_name or os.getenv('GEMINI_MODEL', 'default')}")
    print(f"Proxy: {os.getenv('GEMINI_BASE_URL', 'default')}")
    print(f"Tests: {len(TESTS)}")
    print("="*80)

    # Temporarily override the model via the environment; GeminiLLM
    # presumably reads GEMINI_MODEL at construction time, so the override
    # must happen before the LLM is initialized.
    original_model = os.environ.get("GEMINI_MODEL")
    if model_name:
        os.environ["GEMINI_MODEL"] = model_name

    try:
        llm = GeminiLLM()
        results = [
            _run_single_test(llm, test, idx, len(TESTS))
            for idx, test in enumerate(TESTS, 1)
        ]
    finally:
        # Restore the environment so the override does not leak to the rest
        # of the process (the previous version saved but never restored it).
        if model_name:
            if original_model is None:
                os.environ.pop("GEMINI_MODEL", None)
            else:
                os.environ["GEMINI_MODEL"] = original_model

    _print_summary(results)
    return results


def _run_single_test(llm, test: LLMTest, idx: int, total: int) -> Dict[str, Any]:
    """Run one test case against `llm`, print progress, return its record."""
    print(f"\n{'='*80}")
    print(f"Test {idx}/{total}: {test.name}")
    print(f"Query: {test.query}")
    print(f"Expected: {', '.join(test.expected_tools)}")
    print('='*80)

    try:
        # Call the LLM with the full tool schema and any prior turns.
        response_text, tool_calls = llm.generate_response(
            message=test.query,
            context=test.context,
            tools=TOOLS_SCHEMA
        )
    except Exception as e:
        print(f"❌ ERROR: {e}")
        return {
            "test": test.name,
            "expected": test.expected_tools,
            "called": [],
            "response": "",
            "success": False,
            "error": str(e)
        }

    called_tools = [tc["name"] for tc in (tool_calls or [])]

    # Success requires every expected tool to be among the called ones;
    # extra tool calls do not count against the test.
    success = all(tool in called_tools for tool in test.expected_tools)

    print(f"Tools Called: {called_tools or '[]'}")
    print(f"Response Text: {response_text[:200] if response_text else '(empty)'}...")

    if success:
        status = "✅ PASS"
    elif called_tools:
        status = "⚠️  PARTIAL"
    else:
        status = "❌ FAIL"
    print(f"\nResult: {status}")

    return {
        "test": test.name,
        "expected": test.expected_tools,
        "called": called_tools,
        "response": response_text,
        "success": success
    }


def _print_summary(results: List[Dict[str, Any]]) -> None:
    """Print the per-test summary table plus an overall pass-rate verdict."""
    print("\n" + "="*80)
    print("📊 SUMMARY")
    print("="*80)

    passed = sum(1 for r in results if r["success"])
    partial = sum(1 for r in results if not r["success"] and r["called"])
    failed = sum(1 for r in results if not r["called"])

    for r in results:
        if r["success"]:
            print(f"✅ {r['test']:<25} → {', '.join(r['called'])}")
        elif r["called"]:
            print(f"⚠️  {r['test']:<25} → Expected: {r['expected']}, Got: {r['called']}")
        else:
            print(f"❌ {r['test']:<25} → No tools (Text only)")

    print("="*80)
    print(f"✅ Passed:  {passed}/{len(TESTS)} ({passed/len(TESTS)*100:.1f}%)")
    print(f"⚠️  Partial: {partial}/{len(TESTS)}")
    print(f"❌ Failed:  {failed}/{len(TESTS)}")
    print("="*80)

    success_rate = passed / len(TESTS) * 100

    # Verdict thresholds: >=80% → no keyword fallback needed; >=50% →
    # partial assistance; below that, investigate model/prompt/schema.
    if success_rate >= 80:
        print("\n✅ CONCLUSION: LLM is SMART - No keyword matching needed!")
        print("   Action: Remove SAFETY OVERRIDE from dispatcher.py")
    elif success_rate >= 50:
        print("\n⚠️  CONCLUSION: LLM is DECENT - Some help needed")
        print(f"   Action: Keep minimal fallback for {failed} failed cases")
        print("   OR: Improve tool descriptions / system prompt")
    else:
        print("\n❌ CONCLUSION: LLM performance is POOR")
        print("   Possible causes:")
        print("   1. Wrong model (try gemini-2.0-flash-exp or Claude)")
        print("   2. System prompt over-guidance")
        print("   3. Tool schemas unclear")
        print("   4. Proxy issues")


if __name__ == "__main__":
    from argparse import ArgumentParser

    # Minimal CLI: one optional model override, passed straight through.
    cli = ArgumentParser()
    cli.add_argument("--model", type=str, help="Model to test (e.g., gemini-2.0-flash-exp)")
    cli_args = cli.parse_args()

    test_llm_intelligence(model_name=cli_args.model)
