Archon/python/tests/test_url_canonicalization.py
Wirasm 3e204b0be1
Fix race condition in concurrent crawling with unique source IDs (#472)
* Fix race condition in concurrent crawling with unique source IDs

- Add unique hash-based source_id generation to prevent conflicts (sketched below)
- Separate source identification from display with three fields:
  - source_id: 16-char SHA256 hash for unique identification
  - source_url: Original URL for tracking
  - source_display_name: Human-friendly name for UI
- Add comprehensive test suite validating the fix
- Migrate existing data with backward compatibility
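
A minimal sketch of the ID scheme, assuming the canonical URL is hashed with SHA-256 and truncated to 16 hex characters (the real helper lives in URLHandler and canonicalizes the URL first):

    import hashlib

    def generate_unique_source_id(url: str) -> str:
        # 16 hex chars = 64 bits of hash; collisions are negligible
        # below ~1M sources (documented in a later commit)
        return hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]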

* Fix title generation to use source_display_name for better AI context

- Pass source_display_name to title generation function
- Use display name in AI prompt instead of hash-based source_id
- Results in more specific, meaningful titles for each source

* Skip AI title generation when display name is available

- Use source_display_name directly as title to avoid unnecessary AI calls
- More efficient and predictable than AI-generated titles
- Keep AI generation only as fallback for backward compatibility
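
A sketch of the fallback order; choose_title and generate_title_with_ai are illustrative names, not the project's actual functions:

    def generate_title_with_ai(content: str) -> str:
        # Stand-in for the real AI call, kept only as a fallback
        return content.split("\n", 1)[0]

    def choose_title(source_display_name: str | None, content: str) -> str:
        # Prefer the human-friendly display name over an AI round-trip
        if source_display_name:
            return source_display_name
        return generate_title_with_ai(content)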

* Fix critical issues from code review

- Add missing os import to prevent NameError crash
- Remove unused imports (pytest, Mock, patch, hashlib, urlparse, etc.)
- Fix GitHub API capitalization consistency
- Reuse existing DocumentStorageService instance
- Update test expectations to match corrected capitalization

Addresses CodeRabbit review feedback on PR #472

* Add safety improvements from code review

- Truncate display names to 100 chars when used as titles
- Document hash collision probability (negligible for <1M sources)

Simple, pragmatic fixes per KISS principle

* Fix code extraction to use hash-based source_ids and improve display names

- Fixed critical bug where code extraction was using old domain-based source_ids
- Updated code extraction service to accept source_id as a parameter instead of extracting it from the URL (see the sketch below)
- Added special handling for llms.txt and sitemap.xml files in display names
- Added comprehensive tests for source_id handling in code extraction
- Removed unused urlparse import from code_extraction_service.py

This fixes the foreign key constraint errors that were preventing code examples
from being stored after the source_id architecture refactor.
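
The shape of the corrected service API, sketched with hypothetical names; the key point is that the hash-based source_id travels as a parameter rather than being re-derived from the URL:

    import re

    def find_code_blocks(content: str) -> list[str]:
        # Simplified stand-in for the real extraction logic
        return re.findall(r"```(?:\w+)?\n(.*?)```", content, re.DOTALL)

    def extract_code_examples(content: str, source_url: str, source_id: str) -> list[dict]:
        # source_id comes from the caller, so stored examples satisfy
        # the foreign key to the sources table
        return [
            {"source_id": source_id, "url": source_url, "code": block}
            for block in find_code_blocks(content)
        ]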

Co-Authored-By: Claude <noreply@anthropic.com>

* Fix critical variable shadowing and source_type determination issues

- Fixed variable shadowing in document_storage_operations.py where source_url parameter
  was being overwritten by document URLs, causing incorrect source_url in database
- Fixed source_type determination to use actual URLs instead of the hash-based source_id (sketched below)
- Added comprehensive tests for source URL preservation
- Ensure source_type is correctly set to "file" for file uploads, "url" for web crawls

The variable shadowing bug was causing sitemap sources to have the wrong source_url
(last crawled page instead of sitemap URL). The source_type bug would mark all
sources as "url" even for file uploads due to hash-based IDs not starting with "file_".

Co-Authored-By: Claude <noreply@anthropic.com>

* Fix URL canonicalization and document metrics calculation

- Implement proper URL canonicalization to prevent duplicate sources (see the sketch below)
  - Remove trailing slashes (except root)
  - Remove URL fragments
  - Remove tracking parameters (utm_*, gclid, fbclid, etc.)
  - Sort query parameters for consistency
  - Remove default ports (80 for HTTP, 443 for HTTPS)
  - Normalize scheme and domain to lowercase

- Fix avg_chunks_per_doc calculation to avoid division by zero
  - Track processed_docs count separately from total crawl_results
  - Handle all-empty document sets gracefully
  - Show processed/total in logs for better visibility

- Add comprehensive tests for both fixes
  - 10 test cases for URL canonicalization edge cases
  - 4 test cases for document metrics calculation

This prevents database constraint violations when crawling the same
content with URL variations and provides accurate metrics in logs.
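
A standard-library sketch of a canonicalizer implementing the rules above; the tracking-parameter set is illustrative rather than the project's exact list:

    from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

    TRACKING_PARAMS = {"gclid", "fbclid", "ref", "source"}  # utm_* handled below

    def canonicalize_url(url: str) -> str:
        parts = urlparse(url)
        scheme = parts.scheme.lower()
        netloc = parts.netloc.lower()
        # Remove default ports (80 for HTTP, 443 for HTTPS)
        if (scheme, netloc.rsplit(":", 1)[-1]) in {("http", "80"), ("https", "443")}:
            netloc = netloc.rsplit(":", 1)[0]
        # Remove trailing slash except at the root; empty path becomes "/"
        path = parts.path or "/"
        if path != "/" and path.endswith("/"):
            path = path.rstrip("/")
        # Drop tracking params and sort the rest for a stable order
        query = urlencode(sorted(
            (k, v)
            for k, v in parse_qsl(parts.query, keep_blank_values=True)
            if k not in TRACKING_PARAMS and not k.startswith("utm_")
        ))
        # Fragment is dropped entirely
        return urlunparse((scheme, netloc, path, parts.params, query, ""))

The metrics guard reduces to keeping a separate count of documents that actually produced chunks:

    def avg_chunks_per_doc(total_chunks: int, processed_docs: int) -> float:
        # processed_docs can be zero when every crawled page was empty
        return total_chunks / processed_docs if processed_docs else 0.0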

* Fix synchronous extract_source_summary blocking async event loop

- Run extract_source_summary in a thread pool using asyncio.to_thread (sketched below)
- Prevents blocking the async event loop during AI summary generation
- Preserves exact error handling and fallback behavior
- Variables (source_id, combined_content) properly passed to thread

Added comprehensive tests verifying:
- Function runs in thread without blocking
- Error handling works correctly with fallback
- Multiple sources can be processed
- Thread safety with variable passing
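
The pattern, sketched with a stand-in for the project's synchronous extract_source_summary(source_id, content); the fallback value is illustrative:

    import asyncio

    def extract_source_summary(source_id: str, content: str) -> str:
        # Stand-in for the real synchronous AI summary call
        return f"Summary of {source_id}: {content[:100]}"

    async def summarize_without_blocking(source_id: str, combined_content: str) -> str:
        # asyncio.to_thread runs the sync call in a worker thread, so the
        # event loop keeps servicing concurrent crawl tasks meanwhile
        try:
            return await asyncio.to_thread(extract_source_summary, source_id, combined_content)
        except Exception:
            return f"Content from {source_id}"  # illustrative fallback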

* Fix synchronous update_source_info blocking async event loop

- Run update_source_info in a thread pool using asyncio.to_thread (see the sketch below)
- Prevents blocking the async event loop during database operations
- Preserves exact error handling and fallback behavior
- All kwargs properly passed to thread execution

Added comprehensive tests verifying:
- Function runs in thread without blocking
- Error handling triggers fallback correctly
- All kwargs are preserved when passed to thread
- Existing extract_source_summary tests still pass
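
The same wrapping applies here; asyncio.to_thread forwards positional and keyword arguments unchanged to the callable (update_source_info stands for the project's synchronous function):

    async def update_without_blocking(**kwargs):
        # kwargs pass straight through to the sync callable
        return await asyncio.to_thread(update_source_info, **kwargs)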

* Fix race condition in source creation using upsert

- Replace INSERT with UPSERT for new sources to prevent PRIMARY KEY violations (sketched below)
- Handles concurrent crawls attempting to create the same source
- Maintains existing UPDATE behavior for sources that already exist

Added comprehensive tests verifying:
- Concurrent source creation doesn't fail
- Upsert is used for new sources (not insert)
- Update is still used for existing sources
- Async concurrent operations work correctly
- Race conditions with delays are handled

This prevents database constraint errors when multiple crawls target
the same URL simultaneously.
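
A sketch against the Supabase Python client; the table name archon_sources and the conflict column are assumptions:

    def upsert_source(client, record: dict) -> None:
        # UPSERT is idempotent under races: when two crawls create the
        # same source concurrently, the loser updates the row instead of
        # violating the PRIMARY KEY on source_id
        client.table("archon_sources").upsert(record, on_conflict="source_id").execute()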

* Add migration detection UI components

Add MigrationBanner component with clear user instructions for database schema updates. Add useMigrationStatus hook for periodic health check monitoring with graceful error handling.

* Integrate migration banner into main app

Add migration status monitoring and banner display to App.tsx. Shows migration banner when database schema updates are required.

* Enhance backend startup error instructions

Add detailed Docker restart instructions and migration script guidance. Improves user experience when encountering startup failures.

* Add database schema caching to health endpoint

Implement smart caching for schema validation to prevent repeated database queries. Cache successful validations permanently and throttle failures to 30-second intervals. Replace debug prints with proper logging.
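
A minimal sketch of the caching policy, with the validation query injected as a callable (names are illustrative):

    import time

    _schema_valid = False    # a success is cached permanently
    _last_failure_at = 0.0   # failures are rechecked at most every 30s
    _FAILURE_TTL = 30.0

    def schema_check_cached(validate) -> bool:
        global _schema_valid, _last_failure_at
        if _schema_valid:
            return True
        now = time.monotonic()
        if now - _last_failure_at < _FAILURE_TTL:
            return False  # throttled: skip the database query for now
        if validate():    # caller-supplied schema validation query
            _schema_valid = True
            return True
        _last_failure_at = now
        return False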

* Clean up knowledge API imports and logging

Remove duplicate import statements and redundant logging. Improves code clarity and reduces log noise.

* Remove unused instructions prop from MigrationBanner

Clean up component API by removing instructions prop that was accepted but never rendered. Simplifies the interface and eliminates dead code while keeping the functional hardcoded migration steps.

* Add schema_valid flag to migration_required health response

Add schema_valid: false flag to health endpoint response when database schema migration is required. Improves API consistency without changing existing behavior.

---------

Co-authored-by: Claude <noreply@anthropic.com>
2025-08-29 14:54:16 +03:00


"""
Test URL canonicalization in source ID generation.
This test ensures that URLs are properly normalized before hashing
to prevent duplicate sources from URL variations.
"""
import pytest
from src.server.services.crawling.helpers.url_handler import URLHandler
class TestURLCanonicalization:
    """Test that URL canonicalization works correctly for source ID generation."""

    def test_trailing_slash_normalization(self):
        """Test that trailing slashes are handled consistently."""
        handler = URLHandler()

        # These should generate the same ID
        url1 = "https://example.com/path"
        url2 = "https://example.com/path/"
        id1 = handler.generate_unique_source_id(url1)
        id2 = handler.generate_unique_source_id(url2)
        assert id1 == id2, "URLs with/without trailing slash should generate same ID"

        # Root path should keep its slash
        root1 = "https://example.com"
        root2 = "https://example.com/"
        root_id1 = handler.generate_unique_source_id(root1)
        root_id2 = handler.generate_unique_source_id(root2)
        # These should be the same (both normalize to https://example.com/)
        assert root_id1 == root_id2, "Root URLs should normalize consistently"

    def test_fragment_removal(self):
        """Test that URL fragments are removed."""
        handler = URLHandler()

        urls = [
            "https://example.com/page",
            "https://example.com/page#section1",
            "https://example.com/page#section2",
            "https://example.com/page#",
        ]
        ids = [handler.generate_unique_source_id(url) for url in urls]
        # All should generate the same ID
        assert len(set(ids)) == 1, "URLs with different fragments should generate same ID"

    def test_tracking_param_removal(self):
        """Test that tracking parameters are removed."""
        handler = URLHandler()

        # URL without tracking params
        clean_url = "https://example.com/page?important=value"
        # URLs with various tracking params
        tracked_urls = [
            "https://example.com/page?important=value&utm_source=google",
            "https://example.com/page?utm_medium=email&important=value",
            "https://example.com/page?important=value&fbclid=abc123",
            "https://example.com/page?gclid=xyz&important=value&utm_campaign=test",
            "https://example.com/page?important=value&ref=homepage",
            "https://example.com/page?source=newsletter&important=value",
        ]
        clean_id = handler.generate_unique_source_id(clean_url)
        tracked_ids = [handler.generate_unique_source_id(url) for url in tracked_urls]
        # All tracked URLs should generate the same ID as the clean URL
        for tracked_id in tracked_ids:
            assert tracked_id == clean_id, "URLs with tracking params should match clean URL"

    def test_query_param_sorting(self):
        """Test that query parameters are sorted for consistency."""
        handler = URLHandler()

        urls = [
            "https://example.com/page?a=1&b=2&c=3",
            "https://example.com/page?c=3&a=1&b=2",
            "https://example.com/page?b=2&c=3&a=1",
        ]
        ids = [handler.generate_unique_source_id(url) for url in urls]
        # All should generate the same ID
        assert len(set(ids)) == 1, "URLs with reordered query params should generate same ID"

    def test_default_port_removal(self):
        """Test that default ports are removed."""
        handler = URLHandler()

        # HTTP default port (80)
        http_urls = [
            "http://example.com/page",
            "http://example.com:80/page",
        ]
        http_ids = [handler.generate_unique_source_id(url) for url in http_urls]
        assert len(set(http_ids)) == 1, "HTTP URLs with/without :80 should generate same ID"

        # HTTPS default port (443)
        https_urls = [
            "https://example.com/page",
            "https://example.com:443/page",
        ]
        https_ids = [handler.generate_unique_source_id(url) for url in https_urls]
        assert len(set(https_ids)) == 1, "HTTPS URLs with/without :443 should generate same ID"

        # Non-default ports should be preserved
        url1 = "https://example.com:8080/page"
        url2 = "https://example.com:9090/page"
        id1 = handler.generate_unique_source_id(url1)
        id2 = handler.generate_unique_source_id(url2)
        assert id1 != id2, "URLs with different non-default ports should generate different IDs"

    def test_case_normalization(self):
        """Test that scheme and domain are lowercased."""
        handler = URLHandler()

        urls = [
            "https://example.com/Path/To/Page",
            "HTTPS://EXAMPLE.COM/Path/To/Page",
            "https://Example.Com/Path/To/Page",
            "HTTPs://example.COM/Path/To/Page",
        ]
        ids = [handler.generate_unique_source_id(url) for url in urls]
        # All should generate the same ID (path case is preserved)
        assert len(set(ids)) == 1, "URLs with different case in scheme/domain should generate same ID"

        # But different paths should generate different IDs
        path_urls = [
            "https://example.com/path",
            "https://example.com/Path",
            "https://example.com/PATH",
        ]
        path_ids = [handler.generate_unique_source_id(url) for url in path_urls]
        # These should be different (path case matters)
        assert len(set(path_ids)) == 3, "URLs with different path case should generate different IDs"

    def test_complex_canonicalization(self):
        """Test complex URL with multiple normalizations needed."""
        handler = URLHandler()

        urls = [
            "https://example.com/page",
            "HTTPS://EXAMPLE.COM:443/page/",
            "https://Example.com/page#section",
            "https://example.com/page/?utm_source=test",
            "https://example.com:443/page?utm_campaign=abc#footer",
        ]
        ids = [handler.generate_unique_source_id(url) for url in urls]
        # All should generate the same ID
        assert len(set(ids)) == 1, "Complex URLs should normalize to same ID"

    def test_edge_cases(self):
        """Test edge cases and error handling."""
        handler = URLHandler()

        # Empty URL
        empty_id = handler.generate_unique_source_id("")
        assert len(empty_id) == 16, "Empty URL should still generate valid ID"

        # Invalid URL
        invalid_id = handler.generate_unique_source_id("not-a-url")
        assert len(invalid_id) == 16, "Invalid URL should still generate valid ID"

        # URL with special characters
        special_url = "https://example.com/page?key=value%20with%20spaces"
        special_id = handler.generate_unique_source_id(special_url)
        assert len(special_id) == 16, "URL with encoded chars should generate valid ID"

        # Very long URL
        long_url = "https://example.com/" + "a" * 1000
        long_id = handler.generate_unique_source_id(long_url)
        assert len(long_id) == 16, "Long URL should generate valid ID"

    def test_preserves_important_params(self):
        """Test that non-tracking params are preserved."""
        handler = URLHandler()

        # These have different important params, should be different
        url1 = "https://api.example.com/v1/users?page=1"
        url2 = "https://api.example.com/v1/users?page=2"
        id1 = handler.generate_unique_source_id(url1)
        id2 = handler.generate_unique_source_id(url2)
        assert id1 != id2, "URLs with different important params should generate different IDs"

        # But tracking params should still be removed
        url3 = "https://api.example.com/v1/users?page=1&utm_source=docs"
        id3 = handler.generate_unique_source_id(url3)
        assert id3 == id1, "Adding tracking params shouldn't change ID"

    def test_local_file_paths(self):
        """Test handling of local file paths."""
        handler = URLHandler()

        # File URLs
        file_url = "file:///Users/test/document.pdf"
        file_id = handler.generate_unique_source_id(file_url)
        assert len(file_id) == 16, "File URL should generate valid ID"

        # Relative paths
        relative_path = "../documents/file.txt"
        relative_id = handler.generate_unique_source_id(relative_path)
        assert len(relative_id) == 16, "Relative path should generate valid ID"