Commit

Added logic to handle network errors and timeouts
andreburgaud committed Sep 21, 2024
1 parent 0eaa230 commit 4758c40
Showing 2 changed files with 22 additions and 4 deletions.
8 changes: 6 additions & 2 deletions robots/parser.py
@@ -228,11 +228,15 @@ def gen_uri(self, uri: str):
             if err.code in (401, 403):
                 self.disallow_all = True
                 self._errors.append((str(err.code), f"{str(err)} for {uri}"))
-            elif 400 <= err.code < 500:
+            elif 400 <= err.code < 500:  # Unavailable status
                 self.allow_all = True
                 self._warnings.append((str(err.code), f"{str(err)} for {uri}"))
+            elif 500 <= err.code < 600:  # Unreachable status
+                self.disallow_all = True
+                self._warnings.append((str(err.code), f"{str(err)} for {uri}"))
+                self.timestamp = 0
-        except urllib.error.URLError as err:
+        except urllib.error.URLError as err:  # Unreachable status?
             self.disallow_all = True
             now = time.time()
             duration = round(now - self.timestamp)
             self._errors.append(("", f"{str(err)} for {uri} (duration={duration}s)"))
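For readers skimming the diff: the hunk above maps fetch failures onto the parser's allow_all/disallow_all flags. A minimal, self-contained sketch of that mapping follows; the helper name classify_fetch_error and its (allow_all, disallow_all) return value are illustrative, not part of the library.

import urllib.error

def classify_fetch_error(err: Exception) -> tuple[bool, bool]:
    # Illustrative only: map a fetch failure to (allow_all, disallow_all),
    # mirroring the classification in the hunk above.
    if isinstance(err, urllib.error.HTTPError):
        if err.code in (401, 403):
            return False, True    # access denied => disallow all
        if 400 <= err.code < 500:
            return True, False    # robots.txt unavailable => allow all
        if 500 <= err.code < 600:
            return False, True    # server error / unreachable => disallow all
    if isinstance(err, urllib.error.URLError):
        return False, True        # network error or timeout => disallow all
    return False, False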
18 changes: 16 additions & 2 deletions tests/test_network.py
@@ -18,11 +18,11 @@ def parser():
 
 
 def test_basic_disallow_all(parser):
-    assert parser.disallow_all is False
+    assert not parser.disallow_all
 
 
 def test_basic_allow_all(parser):
-    assert parser.allow_all is False
+    assert not parser.allow_all
 
 
 can_fetch_data = (
@@ -53,3 +53,17 @@ def test_utf16():
     p = robots.RobotsParser.from_uri('https://robotspy.org/tests/robots_utf16.txt')
     assert p.allow_all  # robots file with unexpected encoding (must be UTF-8) => allow access to all paths
     assert p.can_fetch('FooBot', '/admin')
+
+def test_short_timeout():
+    p = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 0)
+    assert p.errors
+    assert p.disallow_all
+    assert not p.can_fetch('FooBot', '/admin')
+
+def test_error_timeout():
+    p = robots.RobotsParser.from_uri("https://robotspy.org:555/robots.txt", 1)
+
+    # The duration may exceed the timeout because the urllib.request.urlopen timeout is not a total timeout for the whole request
+    assert p.errors
+    assert p.disallow_all
+    assert not p.can_fetch('FooBot', '/admin')
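A usage sketch of the behavior these tests exercise; the URL and the 5-second timeout are illustrative, and the tests above suggest the second positional argument of from_uri is the fetch timeout in seconds.

import robots

# Fetch robots.txt with a 5-second timeout; on a network error or timeout
# the parser records errors and falls back to disallowing everything.
p = robots.RobotsParser.from_uri("https://robotspy.org/robots.txt", 5)
if p.errors:
    print("fetch failed:", p.errors)
print(p.can_fetch("FooBot", "/admin"))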
