Commit

Added logic to handle network errors and timeouts
andreburgaud committed Sep 21, 2024
1 parent 0eaa230 commit 4758c40
Showing 2 changed files with 22 additions and 4 deletions.
8 changes: 6 additions & 2 deletions robots/parser.py
@@ -228,11 +228,15 @@ def gen_uri(self, uri: str):
             if err.code in (401, 403):
                 self.disallow_all = True
                 self._errors.append((str(err.code), f"{str(err)} for {uri}"))
-            elif 400 <= err.code < 500:
+            elif 400 <= err.code < 500:  # Unavailable status
                 self.allow_all = True
                 self._warnings.append((str(err.code), f"{str(err)} for {uri}"))
+            elif 500 <= err.code < 600:  # Unreachable status
+                self.disallow_all = True
+                self._warnings.append((str(err.code), f"{str(err)} for {uri}"))
+                self.timestamp = 0
-        except urllib.error.URLError as err:
+        except urllib.error.URLError as err:  # Unreachable status?
             self.disallow_all = True
             now = time.time()
             duration = round(now - self.timestamp)
             self._errors.append(("", f"{str(err)} for {uri} (duration={duration}s)"))
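For readers skimming the diff: the hunk above maps fetch failures onto the parser's allow_all/disallow_all flags. A minimal, self-contained sketch of that mapping follows; the helper name classify_fetch_error and its (allow_all, disallow_all) return value are illustrative, not part of the library.

import urllib.error

def classify_fetch_error(err: Exception) -> tuple[bool, bool]:
    # Illustrative only: map a fetch failure to (allow_all, disallow_all),
    # mirroring the classification in the hunk above.
    if isinstance(err, urllib.error.HTTPError):
        if err.code in (401, 403):
            return False, True    # access denied => disallow all
        if 400 <= err.code < 500:
            return True, False    # robots.txt unavailable => allow all
        if 500 <= err.code < 600:
            return False, True    # server error / unreachable => disallow all
    if isinstance(err, urllib.error.URLError):
        return False, True        # network error or timeout => disallow all
    return False, False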
18 changes: 16 additions & 2 deletions tests/test_network.py
@@ -18,11 +18,11 @@ def parser():
 
 
 def test_basic_disallow_all(parser):
-    assert parser.disallow_all is False
+    assert not parser.disallow_all
 
 
 def test_basic_allow_all(parser):
-    assert parser.allow_all is False
+    assert not parser.allow_all
 
 
 can_fetch_data = (
@@ -53,3 +53,17 @@ def test_utf16():
     p = robots.RobotsParser.from_uri('https://robotspy.org/tests/robots_utf16.txt')
     assert p.allow_all  # robots file with unexpected encoding (must be UTF-8) => allow access to all paths
     assert p.can_fetch('FooBot', '/admin')
+
+def test_short_timeout():
+    p = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 0)
+    assert p.errors
+    assert p.disallow_all
+    assert not p.can_fetch('FooBot', '/admin')
+
+def test_error_timeout():
+    p = robots.RobotsParser.from_uri("https://robotspy.org:555/robots.txt", 1)
+
+    # The duration may exceed the timeout because the urllib.request.urlopen timeout is not a total timeout for the whole request
+    assert p.errors
+    assert p.disallow_all
+    assert not p.can_fetch('FooBot', '/admin')
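A usage sketch of the behavior these tests exercise; the URL and the 5-second timeout are illustrative, and the tests above suggest the second positional argument of from_uri is the fetch timeout in seconds.

import robots

# Fetch robots.txt with a 5-second timeout; on a network error or timeout
# the parser records errors and falls back to disallowing everything.
p = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 5)
if p.errors:
    print("fetch failed:", p.errors)
print(p.can_fetch("FooBot", "/admin"))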
