gh-138907: Support RFC 9309 in robotparser (GH-138908)

* empty lines are always ignored instead of separating groups
* the "user-agent" line after a rule starts a new group
* groups matching the same user agent are now merged
* the rule with the longest match wins instead of the first matching rule
* in case of equal matches, the "Allow" rule wins over "Disallow"
* special characters "$" and "*" are now supported in rules
* an exact user-agent match is now preferred over a substring match (see the sketch below)
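A quick sketch of the new matching behaviour through the public RobotFileParser API (the robots.txt content and the "mybot" agent name are invented for illustration):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: *
Disallow: *.gif$
Allow: /example/
""".splitlines())

# The longest match wins: "Disallow: *.gif$" matches all 14 characters of
# "/example/x.gif", beating the 9-character match of "Allow: /example/".
print(parser.can_fetch("mybot", "/example/x.gif"))      # False
# Here "Allow: /example/" is the only matching rule.
print(parser.can_fetch("mybot", "/example/page.html"))  # True
# The /robots.txt URI itself is implicitly allowed.
print(parser.can_fetch("mybot", "/robots.txt"))         # True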
Serhiy Storchaka
2026-05-04 21:03:11 +03:00
committed by GitHub
parent c74cba16a3
commit bc285e5832
4 changed files with 441 additions and 111 deletions
Doc/library/urllib.robotparser.rst (+1 -1)
@@ -18,7 +18,7 @@
This module provides a single class, :class:`RobotFileParser`, which answers
questions about whether or not a particular user agent can fetch a URL on the
website that published the :file:`robots.txt` file. For more details on the
structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
structure of :file:`robots.txt` files, see :rfc:`9309`.
.. class:: RobotFileParser(url='')
Lib/test/test_robotparser.py (+301 -39)
@@ -15,14 +15,18 @@ class BaseRobotTest:
good = []
bad = []
site_maps = None
expected_output = None
def __init_subclass__(cls):
super().__init_subclass__()
# Remove tests that do nothing.
if not cls.good:
cls.test_good_urls = None
if not cls.bad:
cls.test_bad_urls = None
if issubclass(cls, unittest.TestCase):
if not cls.good:
cls.test_good_urls = None
if not cls.bad:
cls.test_bad_urls = None
if cls.expected_output is None:
cls.test_string_formatting = None
def setUp(self):
lines = io.StringIO(self.robots_txt).readlines()
@@ -50,6 +54,8 @@ class BaseRobotTest:
def test_site_maps(self):
self.assertEqual(self.parser.site_maps(), self.site_maps)
def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output)
class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
@@ -61,6 +67,56 @@ Disallow: /foo.html
good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']
class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
# Example from RFC 9309, section 5.1.
robots_txt = """\
User-Agent: *
Disallow: *.gif$
Disallow: /example/
Allow: /publications/
User-Agent: foobot
Disallow:/
Allow:/example/page.html
Allow:/example/allowed.gif
User-Agent: barbot
User-Agent: bazbot
Disallow: /example/page.html
User-Agent: quxbot
"""
good = [
'/', '/publications/',
('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
('barbot', '/'), ('barbot', '/example/'),
('barbot', '/example/allowed.gif'),
('barbot', '/example/disallowed.gif'),
('barbot', '/publications/'),
('barbot', '/publications/allowed.gif'),
('bazbot', '/'), ('bazbot', '/example/'),
('bazbot', '/example/allowed.gif'),
('bazbot', '/example/disallowed.gif'),
('bazbot', '/publications/'),
('bazbot', '/publications/allowed.gif'),
('quxbot', '/'), ('quxbot', '/example/'),
('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
('quxbot', '/example/disallowed.gif'),
('quxbot', '/publications/'),
('quxbot', '/publications/allowed.gif'),
]
bad = [
'/example/', '/example/page.html', '/example/allowed.gif',
'/example/disallowed.gif',
'/publications/allowed.gif',
('foobot', '/'), ('foobot', '/example/'),
('foobot', '/example/disallowed.gif'),
('foobot', '/publications/'),
('foobot', '/publications/allowed.gif'),
('barbot', '/example/page.html'),
('bazbot', '/example/page.html'),
]
class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
@@ -102,7 +158,7 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
User-agent: *
Disallow: /
"""
good = []
good = ['/robots.txt']
bad = ['/cyberworld/map/index.html', '/', '/tmp/']
@@ -137,6 +193,7 @@ class BaseRequestRateTest(BaseRobotTest):
class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
robots_txt = ''
good = ['/foo']
expected_output = ''
class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
@@ -203,35 +260,209 @@ Request-rate: whale/banana
class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
# the order of User-agent should be correct. note
# that this file is incorrect because "Googlebot" is a
# substring of "Googlebot-Mobile"
# the order of User-agent should not matter
robots_txt = """\
User-agent: Googlebot
Disallow: /
Allow: /folder1/
User-agent: Googlebot-Mobile
Allow: /
Disallow: /folder1/
"""
agent = 'Googlebot'
bad = ['/something.jpg']
good = ['/folder1/myfile.html']
class UserAgentGoogleMobileTest(UserAgentOrderingTest):
agent = 'Googlebot-Mobile'
agent = 'Googlebot-mobile'
bad = ['/folder1/myfile.html']
good = ['/something.jpg']
class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
# Google also got the order wrong. You need
# to specify the URLs from more specific to more general
class LongestMatchTest(BaseRobotTest, unittest.TestCase):
# Based on example from RFC 9309, section 5.2.
robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
User-agent: *
Allow: /example/page/
Disallow: /example/page/disallowed.gif
Allow: /example/
"""
agent = 'googlebot'
good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
good = ['/example/', '/example/page/']
bad = ['/example/page/disallowed.gif']
class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Allow: /example/page/
Disallow: *.gif
Allow: /example/
"""
good = ['/example/', '/example/page/']
bad = ['/example/page/disallowed.gif', '/x.gif']
class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /spam
Allow: /spam
Disallow: /spam
"""
good = ['/spam', '/spam/']
class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /spam
Allow: /spam$
Disallow: /spam
Disallow: /eggs$
Allow: /eggs
Disallow: /eggs$
"""
good = ['/spam', '/eggs', '/eggs/']
bad = ['/spam/']
class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /spam
Allow: *am
Disallow: /spam
Disallow: *gs
Allow: /eggs
Disallow: *gs
"""
good = ['/spam', '/eggs', '/spam/', '/eggs/']
class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: spambot
Disallow: /some/path
User-agent: spambot
Disallow: /another/path
"""
agent = 'spambot'
bad = ['/some/path', '/another/path']
class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: spambot
Disallow: /some/path
User-agent: eggsbot
Disallow: /another/path
"""
good = [('spambot', '/'), ('spambot', '/another/path'),
('eggsbot', '/'), ('eggsbot', '/some/path')]
bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
expected_output = """\
User-agent: spambot
Disallow: /some/path
User-agent: eggsbot
Disallow: /another/path\
"""
class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: spambot
User-agent: eggsbot
Disallow: /some/path
Disallow: /another/path
"""
good = [('spambot', '/'), ('eggsbot', '/')]
bad = [
('spambot', '/some/path'), ('spambot', '/another/path'),
('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
]
expected_output = """\
User-agent: spambot
User-agent: eggsbot
Disallow: /some/path
Disallow: /another/path\
"""
class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
Disallow: /some/path
User-agent: *
Disallow: /another/path
"""
good = ['/', '/some/path']
bad = ['/another/path']
expected_output = """\
User-agent: *
Disallow: /another/path\
"""
class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
Disallow: /some/path
User-agent: spambot
"""
agent = 'spambot'
good = ['/', '/some/path']
expected_output = """\
User-agent: *
Disallow: /some/path
User-agent: spambot
Allow:\
"""
class WeirdPathTest(BaseRobotTest, unittest.TestCase):
robots_txt = f"""\
User-agent: *
Disallow: /a$$$
Disallow: /b$z
Disallow: /c***
Disallow: /d***z
Disallow: /e*$**$$
Disallow: /f*$**$$z
Disallow: /g$*$$**
Disallow: /h$*$$**z
"""
good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz',
'/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz',
'/h$$$z', '/h$x$$yz']
bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy',
'/e$$', '/ex$y$', '/g']
expected_output = """\
User-agent: *
Disallow: /a$
Disallow: /c*
Disallow: /d*z
Disallow: /e*$
Disallow: /g$\
"""
class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
# This test would take many years if a naive translation to a regular
# expression (* -> .*) were used.
N = 50
robots_txt = f"""\
User-agent: *
Disallow: /{'*a'*N}*b
"""
good = ['/' + 'a'*N + 'a']
bad = ['/' + 'a'*N + 'b']
class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
@@ -245,25 +476,13 @@ Disallow: /yet/one/path?name=value&more
good = ['/some/path', '/some/path?',
'/some/path%3Fname=value', '/some/path?name%3Dvalue',
'/another/path', '/another/path%3F',
'/yet/one/path?name=value%26more']
'/yet/one/path?name=value%26more',
'/some/pathxname=value']
bad = ['/some/path?name=value',
'/another/path?', '/another/path?name=value',
'/yet/one/path?name=value&more']
class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
# obey first * entry (#4108)
robots_txt = """\
User-agent: *
Disallow: /some/path
User-agent: *
Disallow: /another/path
"""
good = ['/another/path']
bad = ['/some/path']
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
robots_txt = """\
User-agent: *
@@ -365,17 +584,60 @@ Disallow: /some/path
"""
expected_output = """\
User-agent: cybermapper
Disallow: /some/path
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/\
Disallow: /cyberworld/map/
User-agent: cybermapper
Disallow: /some/path\
"""
def test_string_formatting(self):
self.assertEqual(str(self.parser), self.expected_output)
class ConstructedStringFormattingTest(unittest.TestCase):
def test_empty(self):
parser = urllib.robotparser.RobotFileParser()
self.assertEqual(str(parser), '')
def test_group_without_rules(self):
parser = urllib.robotparser.RobotFileParser()
entry = urllib.robotparser.Entry()
entry.useragents = ['spambot']
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.useragents = ['hambot']
entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.useragents = ['eggsbot']
parser._add_entry(entry)
self.assertEqual(str(parser), """\
User-agent: spambot
Allow:
User-agent: hambot
Disallow: /ham
User-agent: eggsbot
Allow:\
""")
def test_group_without_user_agent(self):
parser = urllib.robotparser.RobotFileParser()
entry = urllib.robotparser.Entry()
entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.useragents = ['spambot']
entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
parser._add_entry(entry)
entry = urllib.robotparser.Entry()
entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
parser._add_entry(entry)
self.assertEqual(str(parser), """\
User-agent: spambot
Disallow: /spam\
""")
@unittest.skipUnless(
@@ -495,7 +757,7 @@ class NetworkTestCase(unittest.TestCase):
def test_can_fetch(self):
self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
self.assertTrue(self.parser.can_fetch('*', self.base_url))
Lib/urllib/robotparser.py (+138 -71)
@@ -7,7 +7,7 @@
2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/norobots-rfc.txt
RFC 9309
"""
import collections
@@ -21,19 +21,6 @@ __all__ = ["RobotFileParser"]
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
def normalize(path):
unquoted = urllib.parse.unquote(path, errors='surrogateescape')
return urllib.parse.quote(unquoted, errors='surrogateescape')
def normalize_path(path):
path, sep, query = path.partition('?')
path = normalize(path)
if sep:
query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
path += '?' + query
return path
class RobotFileParser:
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
@@ -42,6 +29,7 @@ class RobotFileParser:
def __init__(self, url=''):
self.entries = []
self.groups = {}
self.sitemaps = []
self.default_entry = None
self.disallow_all = False
@@ -86,13 +74,13 @@ class RobotFileParser:
self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
def _add_entry(self, entry):
if "*" in entry.useragents:
# the default entry is considered last
if self.default_entry is None:
# the first default entry wins
self.default_entry = entry
else:
self.entries.append(entry)
self.entries.append(entry)
for agent in entry.useragents:
agent = agent.lower()
if agent not in self.groups:
self.groups[agent] = entry
else:
self.groups[agent] = merge_entries(self.groups[agent], entry)
def parse(self, lines):
"""Parse the input lines from a robots.txt file.
@@ -100,6 +88,7 @@ class RobotFileParser:
We allow that a user-agent: line is not preceded by
one or more blank lines.
"""
entries = []
# states:
# 0: start state
# 1: saw user-agent line
@@ -109,14 +98,6 @@ class RobotFileParser:
self.modified()
for line in lines:
if not line:
if state == 1:
entry = Entry()
state = 0
elif state == 2:
self._add_entry(entry)
entry = Entry()
state = 0
# remove optional comment and strip line
i = line.find('#')
if i >= 0:
@@ -132,16 +113,23 @@ class RobotFileParser:
if state == 2:
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
product_token = line[1]
entry.useragents.append(product_token)
state = 1
elif line[0] == "disallow":
if state != 0:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
try:
entry.rulelines.append(RuleLine(line[1], False))
except ValueError:
pass
elif line[0] == "allow":
if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
try:
entry.rulelines.append(RuleLine(line[1], True))
except ValueError:
pass
elif line[0] == "crawl-delay":
if state != 0:
# before trying to convert to int we need to make
@@ -164,9 +152,18 @@ class RobotFileParser:
# so it doesn't matter where you place it in your file."
# Therefore we do not change the state of the parser.
self.sitemaps.append(line[1])
if state == 2:
if state != 0:
self._add_entry(entry)
def _find_entry(self, useragent):
entry = self.groups.get(useragent.lower())
if entry is not None:
return entry
for entry in self.groups.values():
if entry.applies_to(useragent):
return entry
return self.groups.get('*')
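A small illustration of the lookup order implemented by _find_entry(): an exact (case-insensitive) group match is tried first, then a substring match, then the "*" group. The robots.txt content below mirrors the Googlebot tests above:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: Googlebot
Disallow: /
User-agent: Googlebot-Mobile
Allow: /
""".splitlines())

# The exact "googlebot-mobile" group wins, even though "googlebot" is
# also a substring of the agent name.
print(parser.can_fetch("Googlebot-Mobile", "/page"))  # True
print(parser.can_fetch("Googlebot", "/page"))         # False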
def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url"""
if self.disallow_all:
@@ -179,43 +176,36 @@ class RobotFileParser:
# calls can_fetch() before calling read().
if not self.last_checked:
return False
# search for given user agent matches
# the first match counts
# TODO: The private API is used in order to preserve an empty query.
# This is temporary until the public API starts supporting this feature.
parsed_url = urllib.parse._urlsplit(url, '')
url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
url = normalize_path(url)
url = normalize_uri(url)
if not url:
url = "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
# try the default entry last
if self.default_entry:
return self.default_entry.allowance(url)
# agent not found ==> access granted
return True
if url == '/robots.txt':
# The /robots.txt URI is implicitly allowed.
return True
entry = self._find_entry(useragent)
if entry is None:
return True
return entry.allowance(url)
def crawl_delay(self, useragent):
if not self.mtime():
return None
for entry in self.entries:
if entry.applies_to(useragent):
return entry.delay
if self.default_entry:
return self.default_entry.delay
return None
entry = self._find_entry(useragent)
if entry is None:
return None
return entry.delay
def request_rate(self, useragent):
if not self.mtime():
return None
for entry in self.entries:
if entry.applies_to(useragent):
return entry.req_rate
if self.default_entry:
return self.default_entry.req_rate
return None
entry = self._find_entry(useragent)
if entry is None:
return None
return entry.req_rate
def site_maps(self):
if not self.sitemaps:
@@ -226,7 +216,7 @@ class RobotFileParser:
entries = self.entries
if self.default_entry is not None:
entries = entries + [self.default_entry]
return '\n\n'.join(map(str, entries))
return '\n\n'.join(filter(None, map(str, entries)))
class RuleLine:
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
@@ -235,14 +225,42 @@ class RuleLine:
if path == '' and not allowance:
# an empty value means allow all
allowance = True
self.path = normalize_path(path)
path = re.sub(r'[*]{2,}', '*', path)
path = re.sub(r'[$][$*]+', '$', path)
path = normalize_pattern(path)
self.fullmatch = path.endswith('$')
path = path.rstrip('$')
if '$' in path:
raise ValueError('$ not at the end of path')
self.matcher = None
if '*' in path:
pattern = re.compile(translate_pattern(path), re.DOTALL)
if self.fullmatch:
self.matcher = pattern.fullmatch
else:
self.matcher = pattern.match
self.path = path
self.allowance = allowance
def applies_to(self, filename):
return self.path == "*" or filename.startswith(self.path)
# If the filename matches the rule, return the matching length plus 1.
# If it does not match, return 0.
if self.matcher is not None:
m = self.matcher(filename)
if m:
return m.end() + 1
else:
if self.fullmatch:
if filename == self.path:
return len(self.path) + 1
else:
if filename.startswith(self.path):
return len(self.path) + 1
return 0
def __str__(self):
return ("Allow" if self.allowance else "Disallow") + ": " + self.path
return (("Allow" if self.allowance else "Disallow") + ": " + self.path
+ ('$' if self.fullmatch else ''))
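A sketch of the matching contract described in the comment above, using the internal RuleLine class directly (as the tests do): applies_to() returns the match length plus one, or 0 for no match:

from urllib.robotparser import RuleLine

assert RuleLine('/example/', False).applies_to('/example/x.gif') == 10
assert RuleLine('*.gif$', False).applies_to('/example/x.gif') == 15  # full match
assert RuleLine('/other/', False).applies_to('/example/x.gif') == 0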
class Entry:
@@ -254,6 +272,8 @@ class Entry:
self.req_rate = None
def __str__(self):
if not self.useragents:
return ''
ret = []
for agent in self.useragents:
ret.append(f"User-agent: {agent}")
@@ -262,27 +282,74 @@ class Entry:
if self.req_rate is not None:
rate = self.req_rate
ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
ret.extend(map(str, self.rulelines))
if self.rulelines:
ret.extend(map(str, self.rulelines))
else:
ret.append("Allow:")
return '\n'.join(ret)
def applies_to(self, useragent):
"""check if this entry applies to the specified agent"""
if useragent is None:
return '*' in self.useragents
# split the name token and make it lower case
useragent = useragent.split("/")[0].lower()
for agent in self.useragents:
if agent == '*':
# we have the catch-all agent
return True
agent = agent.lower()
if agent in useragent:
return True
if agent != '*':
agent = agent.lower()
if agent in useragent:
return True
return False
def allowance(self, filename):
"""Preconditions:
- our agent applies to this entry
- filename is URL encoded"""
- filename is URL encoded
"""
best_match = -1
allowance = True
for line in self.rulelines:
if line.applies_to(filename):
return line.allowance
return True
m = line.applies_to(filename)
if m:
if m > best_match:
best_match = m
allowance = line.allowance
elif m == best_match and not allowance:
allowance = line.allowance
return allowance
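The tie-break in the elif branch is what makes "Allow" win on equal match lengths; a minimal sketch with the internal classes:

from urllib.robotparser import Entry, RuleLine

entry = Entry()
entry.rulelines = [RuleLine('/spam', False), RuleLine('/spam', True)]
# Both rules match with equal length; the Allow rule overrides the Disallow.
assert entry.allowance('/spam') is True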
def normalize(path):
unquoted = urllib.parse.unquote(path, errors='surrogateescape')
return urllib.parse.quote(unquoted, errors='surrogateescape')
def normalize_uri(path):
path, sep, query = path.partition('?')
path = normalize(path)
if sep:
query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
path += '?' + query
return path
def normalize_pattern(path):
path, sep, query = path.partition('?')
path = re.sub(r'[^*$]+', lambda m: normalize(m[0]), path)
if sep:
query = re.sub(r'[^=&*$]+', lambda m: normalize(m[0]), query)
path += '?' + query
return path
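Illustrative values for the two normalizers above: percent-encodings are canonicalized, while "*" and "$" survive only in rule patterns, not in fetched URLs:

from urllib.robotparser import normalize_uri, normalize_pattern

assert normalize_uri('/a%7Eb') == '/a~b'      # %7E is unreserved "~"
assert normalize_uri('/a b') == '/a%20b'      # unsafe bytes get quoted
assert normalize_uri('/p*') == '/p%2A'        # "*" is literal in a URL
assert normalize_pattern('/p*q$') == '/p*q$'  # but kept verbatim in rules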
def translate_pattern(path):
parts = list(map(re.escape, path.split('*')))
for i in range(1, len(parts)-1):
parts[i] = f'(?>.*?{parts[i]})'
parts[-1] = f'.*{parts[-1]}'
return ''.join(parts)
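Why the atomic group (?>...) matters: it keeps the PathWithManyWildcardsTest pattern from backtracking exponentially, unlike a naive "*" -> ".*" translation. A sketch (atomic groups need Python 3.11+):

import re
from urllib.robotparser import translate_pattern

assert translate_pattern('/a*b*c') == '/a(?>.*?b).*c'

# 50 wildcards: each inner "*" becomes an atomic group, so the engine
# never re-tries earlier wildcard positions and both calls return fast.
pattern = re.compile(translate_pattern('/' + '*a' * 50 + '*b'), re.DOTALL)
assert pattern.match('/' + 'a' * 50 + 'b')
assert not pattern.match('/' + 'a' * 51)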
def merge_entries(e1, e2):
entry = Entry()
entry.useragents = list(filter(set(e2.useragents).__contains__, e1.useragents))
entry.rulelines = e1.rulelines + e2.rulelines
entry.delay = e1.delay if e2.delay is None else e2.delay
entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate
return entry
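How merged groups behave, in a sketch with the internal classes that mirrors MergeGroupsTest above: rule lines are concatenated, and the later group's delay/rate, when set, takes precedence:

from urllib.robotparser import Entry, RuleLine, merge_entries

e1 = Entry()
e1.useragents = ['spambot']
e1.rulelines = [RuleLine('/some/path', False)]
e2 = Entry()
e2.useragents = ['spambot']
e2.rulelines = [RuleLine('/another/path', False)]
e2.delay = 5

merged = merge_entries(e1, e2)
assert merged.useragents == ['spambot']
assert len(merged.rulelines) == 2
assert merged.delay == 5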
@@ -0,0 +1 @@
Support :rfc:`9309` in :mod:`urllib.robotparser`.