"""
Test that code extraction uses the correct source_id.

This test ensures that the fix for using hash-based source_ids
instead of domain-based source_ids works correctly.
"""

import pytest
from unittest.mock import Mock, AsyncMock, patch, MagicMock
from src.server.services.crawling.code_extraction_service import CodeExtractionService
from src.server.services.crawling.document_storage_operations import DocumentStorageOperations


class TestCodeExtractionSourceId:
    """Test that code extraction properly uses the provided source_id.

    Each test swaps the service's internal steps for mocks and checks that
    the source_id handed to the public entry point is the one that reaches
    the mocked step, rather than a value derived from the URL's domain.
    """

    @pytest.mark.asyncio
    async def test_code_extraction_uses_provided_source_id(self):
        """Test that code extraction uses the hash-based source_id, not domain."""
        # Mock supabase client whose table().select().eq().execute() returns no rows.
        mock_supabase = Mock()
        mock_supabase.table.return_value.select.return_value.eq.return_value.execute.return_value.data = []

        code_service = CodeExtractionService(mock_supabase)

        # Every block produced by the fake extraction step, tagged with the
        # source_id that step received.
        extracted_blocks = []

        async def mock_extract_blocks(crawl_results, source_id, progress_callback=None, start=0, end=100):
            # Simulate finding one code block per document and record the
            # source_id that was passed in.
            for doc in crawl_results:
                extracted_blocks.append({
                    "block": {"code": "print('hello')", "language": "python"},
                    "source_url": doc["url"],
                    "source_id": source_id,  # This should be the provided source_id
                })
            return extracted_blocks

        def mock_prepare_for_storage(*_args, **_kwargs):
            # Evaluated at call time on purpose: extracted_blocks is still
            # empty while the mocks are being wired up, so a precomputed
            # return_value would always contain source_id=None.
            return [
                {"source_id": extracted_blocks[0]["source_id"] if extracted_blocks else None}
            ]

        code_service._extract_code_blocks_from_documents = mock_extract_blocks
        code_service._generate_code_summaries = AsyncMock(return_value=[{"summary": "Test code"}])
        code_service._prepare_code_examples_for_storage = Mock(side_effect=mock_prepare_for_storage)
        code_service._store_code_examples = AsyncMock(return_value=1)

        # Test data
        crawl_results = [
            {
                "url": "https://docs.mem0.ai/example",
                "markdown": "```python\nprint('hello')\n```",
            }
        ]

        url_to_full_document = {
            "https://docs.mem0.ai/example": "Full content with code"
        }

        # The correct hash-based source_id
        correct_source_id = "393224e227ba92eb"

        # Call the method with the correct source_id
        await code_service.extract_and_store_code_examples(
            crawl_results,
            url_to_full_document,
            correct_source_id,
            None,
            0,
            100,
        )

        # Verify that extracted blocks use the correct source_id
        assert len(extracted_blocks) > 0, "Should have extracted at least one code block"

        for block in extracted_blocks:
            # Check that it's using the hash-based source_id, not the domain
            assert block["source_id"] == correct_source_id, \
                f"Should use hash-based source_id '{correct_source_id}', not domain"
            assert block["source_id"] != "docs.mem0.ai", \
                "Should NOT use domain-based source_id"

    @pytest.mark.asyncio
    async def test_document_storage_passes_source_id(self):
        """Test that DocumentStorageOperations passes source_id to code extraction."""
        # Create mock supabase client
        mock_supabase = Mock()

        # Create DocumentStorageOperations instance
        doc_storage = DocumentStorageOperations(mock_supabase)

        # Replace the wrapped service's entry point so the forwarded
        # arguments can be inspected.
        mock_extract = AsyncMock(return_value=5)
        doc_storage.code_extraction_service.extract_and_store_code_examples = mock_extract

        # Test data
        crawl_results = [{"url": "https://example.com", "markdown": "test"}]
        url_to_full_document = {"https://example.com": "test content"}
        source_id = "abc123def456"

        # Call the wrapper method
        result = await doc_storage.extract_and_store_code_examples(
            crawl_results,
            url_to_full_document,
            source_id,
            None,
            0,
            100,
        )

        # Verify the correct source_id was passed
        mock_extract.assert_called_once_with(
            crawl_results,
            url_to_full_document,
            source_id,  # This should be the third argument
            None,
            0,
            100,
        )
        # The wrapper must hand back whatever the wrapped service returned.
        assert result == 5

    @pytest.mark.asyncio
    async def test_no_domain_extraction_from_url(self):
        """Test that we're NOT extracting domain from URL anymore."""
        mock_supabase = Mock()
        mock_supabase.table.return_value.select.return_value.eq.return_value.execute.return_value.data = []

        code_service = CodeExtractionService(mock_supabase)

        # Stub the settings lookup so that awaiting it always yields True.
        code_service._get_setting = AsyncMock(return_value=True)

        # Collects the source_id received by the extraction step on each call.
        source_ids_seen = []

        async def track_source_id(crawl_results, source_id, progress_callback=None, start=0, end=100):
            source_ids_seen.append(source_id)
            return []  # Return empty list to skip further processing

        code_service._extract_code_blocks_from_documents = track_source_id

        # Test with various URLs that would produce different domains
        test_cases = [
            ("https://github.com/example/repo", "github123abc"),
            ("https://docs.python.org/guide", "python456def"),
            ("https://api.openai.com/v1", "openai789ghi"),
        ]

        for url, expected_source_id in test_cases:
            # Start each case from a clean slate so the length check below
            # only counts the call made in this iteration.
            source_ids_seen.clear()

            crawl_results = [{"url": url, "markdown": "# Test"}]
            url_to_full_document = {url: "Full content"}

            await code_service.extract_and_store_code_examples(
                crawl_results,
                url_to_full_document,
                expected_source_id,
                None,
                0,
                100,
            )

            # Verify the provided source_id was used
            assert len(source_ids_seen) == 1
            assert source_ids_seen[0] == expected_source_id
            # Verify it's NOT the domain
            assert "github.com" not in source_ids_seen[0]
            assert "python.org" not in source_ids_seen[0]
            assert "openai.com" not in source_ids_seen[0]

    def test_urlparse_not_imported(self):
        """Test that urlparse is not imported in code_extraction_service."""
        import inspect

        import src.server.services.crawling.code_extraction_service as module

        # Check that urlparse is not in the module's namespace
        assert not hasattr(module, 'urlparse'), \
            "urlparse should not be imported in code_extraction_service"

        # Check the module's source text for the import statement itself
        source = inspect.getsource(module)
        assert "from urllib.parse import urlparse" not in source, \
            "Should not import urlparse since we don't extract domain from URL anymore"