From ce2f871ebb1f5cf69cee9cf43a3be119b60d0ab7 Mon Sep 17 00:00:00 2001 From: DIY Smart Code Date: Fri, 12 Sep 2025 17:07:22 +0200 Subject: [PATCH] Fix sitemap URL detection to require .xml extension (#611) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves an issue where URLs containing 'sitemap' in the path (like https://nx.dev/see-also/sitemap) were incorrectly treated as XML sitemaps, causing XML parsing errors. - Changed detection to require both .xml extension AND 'sitemap' in path - Fixes XML parsing error: "not well-formed (invalid token)" - Maintains compatibility with existing test cases - Now correctly identifies only actual XML sitemap files Fixes #607 🤖 Generated with [Claude Code](https://claude.ai/code) Co-authored-by: Claude --- python/src/server/services/crawling/helpers/url_handler.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index 33c75c5..97a9c5a 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -29,7 +29,10 @@ class URLHandler: True if URL is a sitemap, False otherwise """ try: - return url.endswith("sitemap.xml") or "sitemap" in urlparse(url).path + parsed = urlparse(url) + path = parsed.path.lower() + # Only match URLs whose path ends with .xml and contains "sitemap" anywhere in the path + return path.endswith(".xml") and "sitemap" in path except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False