diff --git a/python/src/server/services/crawling/helpers/url_handler.py b/python/src/server/services/crawling/helpers/url_handler.py index 33c75c5..97a9c5a 100644 --- a/python/src/server/services/crawling/helpers/url_handler.py +++ b/python/src/server/services/crawling/helpers/url_handler.py @@ -29,7 +29,10 @@ class URLHandler: True if URL is a sitemap, False otherwise """ try: - return url.endswith("sitemap.xml") or "sitemap" in urlparse(url).path + parsed = urlparse(url) + path = parsed.path.lower() + # Only match URLs that end with .xml and contain sitemap in the filename + return path.endswith(".xml") and "sitemap" in path except Exception as e: logger.warning(f"Error checking if URL is sitemap: {e}") return False