Commit d6e2e28

fix(uptime): If we fail to fetch robots.txt we should allow the url to be processed. (#74655)
We shouldn't block a URL from detection if we fail to fetch the robots.txt for that site.
1 parent b6f6732 commit d6e2e28

2 files changed: 10 additions, 1 deletion

src/sentry/uptime/detectors/tasks.py

Lines changed: 1 addition & 1 deletion

@@ -267,7 +267,7 @@ def check_url_robots_txt(url: str) -> bool:
         return get_robots_txt_parser(url).can_fetch(UPTIME_USER_AGENT, url)
     except Exception:
         logger.warning("Failed to check robots.txt", exc_info=True)
-        return False
+        return True


 def get_robots_txt_parser(url: str) -> RobotFileParser:
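For context, after this change check_url_robots_txt fails open: any error while fetching or parsing robots.txt is logged and the URL is treated as fetchable. Below is a minimal, self-contained sketch of that behavior; the UPTIME_USER_AGENT value and the body of get_robots_txt_parser are illustrative stand-ins, not the real sentry implementation.

    import logging
    from urllib.robotparser import RobotFileParser

    logger = logging.getLogger(__name__)
    UPTIME_USER_AGENT = "SentryUptimeBot"  # illustrative value, not the real constant


    def get_robots_txt_parser(url: str) -> RobotFileParser:
        # Stand-in for the real helper: point a stdlib parser at the site's robots.txt.
        parser = RobotFileParser()
        parser.set_url(url.rstrip("/") + "/robots.txt")
        parser.read()  # may raise on network errors; caught by the caller below
        return parser


    def check_url_robots_txt(url: str) -> bool:
        try:
            return get_robots_txt_parser(url).can_fetch(UPTIME_USER_AGENT, url)
        except Exception:
            # Fail open: an unreachable or malformed robots.txt should not block detection.
            logger.warning("Failed to check robots.txt", exc_info=True)
            return True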

tests/sentry/uptime/detectors/test_tasks.py

Lines changed: 9 additions & 0 deletions

@@ -307,6 +307,15 @@ def test_no_robots_txt(self):
         ):
             assert process_candidate_url(self.project, 100, url, 50)

+    def test_error_robots_txt(self):
+        # Failing to fetch robots.txt should allow all urls
+        url = "https://sentry.io"
+        with mock.patch(
+            "sentry.uptime.detectors.tasks.get_robots_txt_parser",
+            side_effect=Exception("Robots.txt fetch failed"),
+        ):
+            assert process_candidate_url(self.project, 100, url, 50)
+

 class TestFailedUrl(TestCase):
     def test(self):
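The added test simulates the failure by patching the parser factory to raise via side_effect, then asserts the URL is still processed. The same technique can be reproduced in isolation against the sketch above; uptime_sketch below is a hypothetical module name holding that sketch, not part of the sentry codebase.

    from unittest import mock

    import uptime_sketch  # hypothetical module containing the earlier sketch


    def test_robots_txt_fetch_error_fails_open() -> None:
        # side_effect raises when the patched factory is called,
        # simulating a failed robots.txt fetch.
        with mock.patch(
            "uptime_sketch.get_robots_txt_parser",
            side_effect=Exception("Robots.txt fetch failed"),
        ):
            assert uptime_sketch.check_url_robots_txt("https://sentry.io") is True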
