From ce2f871ebb1f5cf69cee9cf43a3be119b60d0ab7 Mon Sep 17 00:00:00 2001
From: DIY Smart Code <thomas@thirty3.de>
Date: Fri, 12 Sep 2025 17:07:22 +0200
Subject: [PATCH] Fix sitemap URL detection to require .xml extension (#611)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Resolves an issue where URLs containing 'sitemap' in the path (like
https://nx.dev/see-also/sitemap) were incorrectly treated as XML
sitemaps, causing XML parsing errors.

- Changed detection to require both .xml extension AND 'sitemap' in path
- Fixes XML parsing error: "not well-formed (invalid token)"
- Maintains compatibility with existing test cases
- Now correctly identifies only actual XML sitemap files

Fixes #607

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
---
 python/src/server/services/crawling/helpers/url_handler.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py
index 33c75c5..97a9c5a 100644
--- a/python/src/server/services/crawling/helpers/url_handler.py
+++ b/python/src/server/services/crawling/helpers/url_handler.py
@@ -29,7 +29,10 @@ class URLHandler:
             True if URL is a sitemap, False otherwise
         """
         try:
-            return url.endswith("sitemap.xml") or "sitemap" in urlparse(url).path
+            parsed = urlparse(url)
+            path = parsed.path.lower()
+            # Only match URLs whose path ends with .xml and contains "sitemap" (case-insensitive)
+            return path.endswith(".xml") and "sitemap" in path
         except Exception as e:
             logger.warning(f"Error checking if URL is sitemap: {e}")
             return False