Merge branch '3.14' of https://github.com/python/cpython into 3.14

Doc/library/urllib.robotparser.rst
@@ -20,7 +20,7 @@
 This module provides a single class, :class:`RobotFileParser`, which answers
 questions about whether or not a particular user agent can fetch a URL on the
 website that published the :file:`robots.txt` file. For more details on the
-structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
+structure of :file:`robots.txt` files, see :rfc:`9309`.


 .. class:: RobotFileParser(url='')
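
For orientation, a minimal sketch of how the documented class is driven (the URL is a placeholder, not taken from the commit):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://example.com/robots.txt")  # placeholder URL
    rp.read()  # fetch and parse the robots.txt file
    print(rp.can_fetch("MyBot", "https://example.com/some/page"))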

Lib/test/test_robotparser.py (+301 −39)
@@ -15,14 +15,18 @@ class BaseRobotTest:
     good = []
     bad = []
     site_maps = None
+    expected_output = None

     def __init_subclass__(cls):
         super().__init_subclass__()
         # Remove tests that do nothing.
-        if not cls.good:
-            cls.test_good_urls = None
-        if not cls.bad:
-            cls.test_bad_urls = None
+        if issubclass(cls, unittest.TestCase):
+            if not cls.good:
+                cls.test_good_urls = None
+            if not cls.bad:
+                cls.test_bad_urls = None
+            if cls.expected_output is None:
+                cls.test_string_formatting = None

     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
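
The None-assignment works because unittest's default TestLoader only collects "test*" attributes that are callable; a standalone sketch of the idiom:

    import unittest

    class Base:
        expected_output = None

        def __init_subclass__(cls):
            super().__init_subclass__()
            # Overriding a test method with None drops it from collection.
            if cls.expected_output is None:
                cls.test_string_formatting = None

        def test_string_formatting(self):
            ...

    class WithoutOutput(Base, unittest.TestCase):
        pass  # test_string_formatting is not collected for this class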
@@ -50,6 +54,8 @@ class BaseRobotTest:
     def test_site_maps(self):
         self.assertEqual(self.parser.site_maps(), self.site_maps)

+    def test_string_formatting(self):
+        self.assertEqual(str(self.parser), self.expected_output)
+

 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -61,6 +67,56 @@ Disallow: /foo.html
     good = ['/', '/test.html']
     bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


+class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
+    # Example from RFC 9309, section 5.1.
+    robots_txt = """\
+User-Agent: *
+Disallow: *.gif$
+Disallow: /example/
+Allow: /publications/
+
+User-Agent: foobot
+Disallow:/
+Allow:/example/page.html
+Allow:/example/allowed.gif
+
+User-Agent: barbot
+User-Agent: bazbot
+Disallow: /example/page.html
+
+User-Agent: quxbot
+"""
+    good = [
+        '/', '/publications/',
+        ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
+        ('barbot', '/'), ('barbot', '/example/'),
+        ('barbot', '/example/allowed.gif'),
+        ('barbot', '/example/disallowed.gif'),
+        ('barbot', '/publications/'),
+        ('barbot', '/publications/allowed.gif'),
+        ('bazbot', '/'), ('bazbot', '/example/'),
+        ('bazbot', '/example/allowed.gif'),
+        ('bazbot', '/example/disallowed.gif'),
+        ('bazbot', '/publications/'),
+        ('bazbot', '/publications/allowed.gif'),
+        ('quxbot', '/'), ('quxbot', '/example/'),
+        ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
+        ('quxbot', '/example/disallowed.gif'),
+        ('quxbot', '/publications/'),
+        ('quxbot', '/publications/allowed.gif'),
+    ]
+    bad = [
+        '/example/', '/example/page.html', '/example/allowed.gif',
+        '/example/disallowed.gif',
+        '/publications/allowed.gif',
+        ('foobot', '/'), ('foobot', '/example/'),
+        ('foobot', '/example/disallowed.gif'),
+        ('foobot', '/publications/'),
+        ('foobot', '/publications/allowed.gif'),
+        ('barbot', '/example/page.html'),
+        ('bazbot', '/example/page.html'),
+    ]
+
+
 class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
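
A quick sanity check of the RFC 9309 section 5.1 semantics exercised above (a sketch; the expected results assume the parser changes in this commit):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(["User-Agent: foobot",
                  "Disallow:/",
                  "Allow:/example/page.html"])
    # Longest-match semantics: the specific Allow overrides "Disallow: /".
    print(parser.can_fetch("foobot", "/example/page.html"))   # True
    print(parser.can_fetch("foobot", "/example/other.html"))  # False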
@@ -102,7 +158,7 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
 User-agent: *
 Disallow: /
 """
-    good = []
+    good = ['/robots.txt']
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']

@@ -137,6 +193,7 @@ class BaseRequestRateTest(BaseRobotTest):
 class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = ''
     good = ['/foo']
+    expected_output = ''


 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
@@ -203,35 +260,209 @@ Request-rate: whale/banana


 class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
-    # the order of User-agent should be correct. note
-    # that this file is incorrect because "Googlebot" is a
-    # substring of "Googlebot-Mobile"
+    # the order of User-agent should not matter
     robots_txt = """\
 User-agent: Googlebot
 Disallow: /
 Allow: /folder1/

 User-agent: Googlebot-Mobile
 Allow: /
 Disallow: /folder1/
 """
     agent = 'Googlebot'
     bad = ['/something.jpg']
     good = ['/folder1/myfile.html']


 class UserAgentGoogleMobileTest(UserAgentOrderingTest):
-    agent = 'Googlebot-Mobile'
+    agent = 'Googlebot-mobile'
     bad = ['/folder1/myfile.html']
     good = ['/something.jpg']

-class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
-    # Google also got the order wrong. You need
-    # to specify the URLs from more specific to more general
+class LongestMatchTest(BaseRobotTest, unittest.TestCase):
+    # Based on example from RFC 9309, section 5.2.
     robots_txt = """\
-User-agent: Googlebot
-Allow: /folder1/myfile.html
-Disallow: /folder1/
+User-agent: *
+Allow: /example/page/
+Disallow: /example/page/disallowed.gif
+Allow: /example/
 """
-    agent = 'googlebot'
-    good = ['/folder1/myfile.html']
-    bad = ['/folder1/anotherfile.html']
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif']

+class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Allow: /example/page/
+Disallow: *.gif
+Allow: /example/
+"""
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif', '/x.gif']
+
+
+class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam
+Disallow: /spam
+"""
+    good = ['/spam', '/spam/']
+
+
+class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam$
+Disallow: /spam
+Disallow: /eggs$
+Allow: /eggs
+Disallow: /eggs$
+"""
+    good = ['/spam', '/eggs', '/eggs/']
+    bad = ['/spam/']
+
+
+class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: *am
+Disallow: /spam
+Disallow: *gs
+Allow: /eggs
+Disallow: *gs
+"""
+    good = ['/spam', '/eggs', '/spam/', '/eggs/']
+
+
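The tie-break these classes pin down: when an Allow and a Disallow rule match with equal length, Allow wins. A minimal sketch, assuming the semantics added by this commit:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(["User-agent: *", "Disallow: /spam", "Allow: /spam"])
    print(parser.can_fetch("anybot", "/spam"))  # True: Allow wins the tie
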
+class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: spambot
+Disallow: /another/path
+"""
+    agent = 'spambot'
+    bad = ['/some/path', '/another/path']
+
+
+class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+User-agent: eggsbot
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('spambot', '/another/path'),
+            ('eggsbot', '/'), ('eggsbot', '/some/path')]
+    bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
+    expected_output = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: eggsbot
+Disallow: /another/path\
+"""
+
+
+class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+
+User-agent: eggsbot
+Disallow: /some/path
+
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('eggsbot', '/')]
+    bad = [
+        ('spambot', '/some/path'), ('spambot', '/another/path'),
+        ('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
+    ]
+    expected_output = """\
+User-agent: spambot
+User-agent: eggsbot
+Disallow: /some/path
+Disallow: /another/path\
+"""
+
+
+class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+Disallow: /some/path
+
+User-agent: *
+Disallow: /another/path
+"""
+    good = ['/', '/some/path']
+    bad = ['/another/path']
+    expected_output = """\
+User-agent: *
+Disallow: /another/path\
+"""
+
+
+class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+"""
+    agent = 'spambot'
+    good = ['/', '/some/path']
+    expected_output = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+Allow:\
+"""
+
+
+class WeirdPathTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = f"""\
+User-agent: *
+Disallow: /a$$$
+Disallow: /b$z
+Disallow: /c***
+Disallow: /d***z
+Disallow: /e*$**$$
+Disallow: /f*$**$$z
+Disallow: /g$*$$**
+Disallow: /h$*$$**z
+"""
+    good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz',
+            '/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz',
+            '/h$$$z', '/h$x$$yz']
+    bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy',
+           '/e$$', '/ex$y$', '/g']
+    expected_output = """\
+User-agent: *
+Disallow: /a$
+Disallow: /c*
+Disallow: /d*z
+Disallow: /e*$
+Disallow: /g$\
+"""
+
+
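The expected_output above reflects the two normalization substitutions that the new RuleLine.__init__ (later in this commit) applies before compiling a rule; a standalone sketch of just that collapsing step:

    import re

    def collapse(path):
        # Collapse runs of '*', then fold '$' followed by '$' or '*' into '$',
        # mirroring the two re.sub() calls in the new RuleLine.__init__.
        path = re.sub(r'[*]{2,}', '*', path)
        path = re.sub(r'[$][$*]+', '$', path)
        return path

    print(collapse('/a$$$'))     # -> /a$
    print(collapse('/c***'))     # -> /c*
    print(collapse('/e*$**$$'))  # -> /e*$
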
+class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
+    # This test would take many years if use naive translation to regular
+    # expression (* -> .*).
+    N = 50
+    robots_txt = f"""\
+User-agent: *
+Disallow: /{'*a'*N}*b
+"""
+    good = ['/' + 'a'*N + 'a']
+    bad = ['/' + 'a'*N + 'b']
+
+
 class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
@@ -245,25 +476,13 @@ Disallow: /yet/one/path?name=value&more
     good = ['/some/path', '/some/path?',
             '/some/path%3Fname=value', '/some/path?name%3Dvalue',
             '/another/path', '/another/path%3F',
-            '/yet/one/path?name=value%26more']
+            '/yet/one/path?name=value%26more',
+            '/some/pathxname=value']
     bad = ['/some/path?name=value'
            '/another/path?', '/another/path?name=value',
            '/yet/one/path?name=value&more']


-class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
-    # obey first * entry (#4108)
-    robots_txt = """\
-User-agent: *
-Disallow: /some/path
-
-User-agent: *
-Disallow: /another/path
-"""
-    good = ['/another/path']
-    bad = ['/some/path']
-
-
 class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
@@ -365,17 +584,60 @@ Disallow: /some/path
 """

     expected_output = """\
-User-agent: cybermapper
-Disallow: /some/path
-
 User-agent: *
 Crawl-delay: 1
 Request-rate: 3/15
-Disallow: /cyberworld/map/\
+Disallow: /cyberworld/map/
+
+User-agent: cybermapper
+Disallow: /some/path\
 """

-    def test_string_formatting(self):
-        self.assertEqual(str(self.parser), self.expected_output)

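The expected_output fixtures compare against str(parser); a round-trip sketch under the new ordering, where groups print in file order (assuming this commit's __str__ behavior):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        "User-agent: *",
        "Disallow: /cyberworld/map/",
        "",
        "User-agent: cybermapper",
        "Disallow: /some/path",
    ])
    print(str(parser))
    # User-agent: *
    # Disallow: /cyberworld/map/
    #
    # User-agent: cybermapper
    # Disallow: /some/path
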
+class ConstructedStringFormattingTest(unittest.TestCase):
+    def test_empty(self):
+        parser = urllib.robotparser.RobotFileParser()
+        self.assertEqual(str(parser), '')
+
+    def test_group_without_rules(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['hambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['eggsbot']
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Allow:
+
+User-agent: hambot
+Disallow: /ham
+
+User-agent: eggsbot
+Allow:\
+""")
+
+    def test_group_without_user_agent(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Disallow: /spam\
+""")
+
+
 @unittest.skipUnless(
@@ -495,7 +757,7 @@ class NetworkTestCase(unittest.TestCase):
     def test_can_fetch(self):
         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
-        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
         self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
         self.assertTrue(self.parser.can_fetch('*', self.base_url))


Lib/urllib/robotparser.py (+138 −71)
@@ -7,7 +7,7 @@
     2) PSF license for Python 2.2

 The robots.txt Exclusion Protocol is implemented as specified in
-http://www.robotstxt.org/norobots-rfc.txt
+RFC 9309
 """

 import collections
@@ -21,19 +21,6 @@ __all__ = ["RobotFileParser"]
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")


-def normalize(path):
-    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
-    return urllib.parse.quote(unquoted, errors='surrogateescape')
-
-def normalize_path(path):
-    path, sep, query = path.partition('?')
-    path = normalize(path)
-    if sep:
-        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
-        path += '?' + query
-    return path
-
-
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
@@ -42,6 +29,7 @@ class RobotFileParser:
     def __init__(self, url=''):
         self.entries = []
+        self.groups = {}
         self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
@@ -86,13 +74,13 @@ class RobotFileParser:
         self.parse(raw.decode("utf-8", "surrogateescape").splitlines())

     def _add_entry(self, entry):
         if "*" in entry.useragents:
-            # the default entry is considered last
             if self.default_entry is None:
                 # the first default entry wins
                 self.default_entry = entry
-        else:
-            self.entries.append(entry)
+        self.entries.append(entry)
+        for agent in entry.useragents:
+            agent = agent.lower()
+            if agent not in self.groups:
+                self.groups[agent] = entry
+            else:
+                self.groups[agent] = merge_entries(self.groups[agent], entry)

     def parse(self, lines):
         """Parse the input lines from a robots.txt file.
@@ -100,6 +88,7 @@ class RobotFileParser:
         We allow that a user-agent: line is not preceded by
         one or more blank lines.
         """
+        entries = []
         # states:
         #   0: start state
         #   1: saw user-agent line
@@ -109,14 +98,6 @@ class RobotFileParser:

         self.modified()
         for line in lines:
-            if not line:
-                if state == 1:
-                    entry = Entry()
-                    state = 0
-                elif state == 2:
-                    self._add_entry(entry)
-                    entry = Entry()
-                    state = 0
             # remove optional comment and strip line
             i = line.find('#')
             if i >= 0:
@@ -132,16 +113,23 @@ class RobotFileParser:
                     if state == 2:
                         self._add_entry(entry)
                         entry = Entry()
-                    entry.useragents.append(line[1])
+                    product_token = line[1]
+                    entry.useragents.append(product_token)
                     state = 1
                 elif line[0] == "disallow":
                     if state != 0:
-                        entry.rulelines.append(RuleLine(line[1], False))
                         state = 2
+                        try:
+                            entry.rulelines.append(RuleLine(line[1], False))
+                        except ValueError:
+                            pass
                 elif line[0] == "allow":
                     if state != 0:
-                        entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                        try:
+                            entry.rulelines.append(RuleLine(line[1], True))
+                        except ValueError:
+                            pass
                 elif line[0] == "crawl-delay":
                     if state != 0:
                         # before trying to convert to int we need to make
@@ -164,9 +152,18 @@ class RobotFileParser:
                     # so it doesn't matter where you place it in your file."
                     # Therefore we do not change the state of the parser.
                     self.sitemaps.append(line[1])
-        if state == 2:
+        if state != 0:
             self._add_entry(entry)

+    def _find_entry(self, useragent):
+        entry = self.groups.get(useragent.lower())
+        if entry is not None:
+            return entry
+        for entry in self.groups.values():
+            if entry.applies_to(useragent):
+                return entry
+        return self.groups.get('*')
+
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
@@ -179,43 +176,36 @@ class RobotFileParser:
         # calls can_fetch() before calling read().
         if not self.last_checked:
             return False
-        # search for given user agent matches
-        # the first match counts
         # TODO: The private API is used in order to preserve an empty query.
         # This is temporary until the public API starts supporting this feature.
         parsed_url = urllib.parse._urlsplit(url, '')
         url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
-        url = normalize_path(url)
+        url = normalize_uri(url)
         if not url:
            url = "/"
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.allowance(url)
-        # try the default entry last
-        if self.default_entry:
-            return self.default_entry.allowance(url)
-        # agent not found ==> access granted
-        return True
+        if url == '/robots.txt':
+            # The /robots.txt URI is implicitly allowed.
+            return True
+        entry = self._find_entry(useragent)
+        if entry is None:
+            return True
+        return entry.allowance(url)

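A short sketch of the two new early paths in can_fetch() (implicit /robots.txt allowance, then group lookup via _find_entry), assuming this commit's behavior:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        "User-agent: figtree",
        "Disallow: /fig",
        "",
        "User-agent: *",
        "Disallow: /",
    ])
    print(parser.can_fetch("anybot", "/robots.txt"))  # True: implicitly allowed
    print(parser.can_fetch("FigTree/3.0", "/fig"))    # False: name-token matches the figtree group
    print(parser.can_fetch("anybot", "/anything"))    # False: falls back to the '*' group
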
     def crawl_delay(self, useragent):
         if not self.mtime():
             return None
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.delay
-        if self.default_entry:
-            return self.default_entry.delay
-        return None
+        entry = self._find_entry(useragent)
+        if entry is None:
+            return None
+        return entry.delay

     def request_rate(self, useragent):
         if not self.mtime():
             return None
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.req_rate
-        if self.default_entry:
-            return self.default_entry.req_rate
-        return None
+        entry = self._find_entry(useragent)
+        if entry is None:
+            return None
+        return entry.req_rate

     def site_maps(self):
         if not self.sitemaps:
@@ -226,7 +216,7 @@ class RobotFileParser:
         entries = self.entries
-        if self.default_entry is not None:
-            entries = entries + [self.default_entry]
-        return '\n\n'.join(map(str, entries))
+        return '\n\n'.join(filter(None, map(str, entries)))


 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
@@ -235,14 +225,42 @@ class RuleLine:
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        self.path = normalize_path(path)
+        path = re.sub(r'[*]{2,}', '*', path)
+        path = re.sub(r'[$][$*]+', '$', path)
+        path = normalize_pattern(path)
+        self.fullmatch = path.endswith('$')
+        path = path.rstrip('$')
+        if '$' in path:
+            raise ValueError('$ not at the end of path')
+        self.matcher = None
+        if '*' in path:
+            pattern = re.compile(translate_pattern(path), re.DOTALL)
+            if self.fullmatch:
+                self.matcher = pattern.fullmatch
+            else:
+                self.matcher = pattern.match
+        self.path = path
         self.allowance = allowance

     def applies_to(self, filename):
-        return self.path == "*" or filename.startswith(self.path)
+        # If the filename matches the rule, return the matching length plus 1.
+        # If it does not match, return 0.
+        if self.matcher is not None:
+            m = self.matcher(filename)
+            if m:
+                return m.end() + 1
+        else:
+            if self.fullmatch:
+                if filename == self.path:
+                    return len(self.path) + 1
+            else:
+                if filename.startswith(self.path):
+                    return len(self.path) + 1
+        return 0

     def __str__(self):
-        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
+        return (("Allow" if self.allowance else "Disallow") + ": " + self.path
+                + ('$' if self.fullmatch else ''))

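How the new RuleLine matching behaves for a wildcard rule anchored with '$' (RuleLine is module-internal, but the new tests use it directly; return values are match length plus one, or 0 for no match):

    from urllib.robotparser import RuleLine

    rule = RuleLine('/example/*.gif$', False)
    print(rule.applies_to('/example/pic.gif'))   # 17 (16 chars matched, plus 1)
    print(rule.applies_to('/example/pic.gifs'))  # 0: '$' forces a full match
    print(str(rule))                             # Disallow: /example/*.gif$
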
 class Entry:
@@ -254,6 +272,8 @@ class Entry:
         self.req_rate = None

     def __str__(self):
+        if not self.useragents:
+            return ''
         ret = []
         for agent in self.useragents:
             ret.append(f"User-agent: {agent}")
@@ -262,27 +282,74 @@ class Entry:
         if self.req_rate is not None:
             rate = self.req_rate
             ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
-        ret.extend(map(str, self.rulelines))
+        if self.rulelines:
+            ret.extend(map(str, self.rulelines))
+        else:
+            ret.append("Allow:")
         return '\n'.join(ret)

     def applies_to(self, useragent):
         """check if this entry applies to the specified agent"""
+        if useragent is None:
+            return '*' in self.useragents
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent == '*':
-                # we have the catch-all agent
-                return True
-            agent = agent.lower()
-            if agent in useragent:
-                return True
+            if agent != '*':
+                agent = agent.lower()
+                if agent in useragent:
+                    return True
         return False

     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL encoded"""
+        - filename is URL encoded
+        """
+        best_match = -1
+        allowance = True
         for line in self.rulelines:
-            if line.applies_to(filename):
-                return line.allowance
-        return True
+            m = line.applies_to(filename)
+            if m:
+                if m > best_match:
+                    best_match = m
+                    allowance = line.allowance
+                elif m == best_match and not allowance:
+                    allowance = line.allowance
+        return allowance
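
A worked sketch of the longest-match selection (Entry and RuleLine as defined in this commit; the rules mirror the RFC 9309 section 5.2 example used by LongestMatchTest):

    from urllib.robotparser import Entry, RuleLine

    entry = Entry()
    entry.rulelines = [
        RuleLine('/example/page/', True),
        RuleLine('/example/page/disallowed.gif', False),
        RuleLine('/example/', True),
    ]
    print(entry.allowance('/example/page/x.html'))          # True: longest match is an Allow
    print(entry.allowance('/example/page/disallowed.gif'))  # False: the Disallow matches longer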
+
+
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+def normalize_uri(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+def normalize_pattern(path):
+    path, sep, query = path.partition('?')
+    path = re.sub(r'[^*$]+', lambda m: normalize(m[0]), path)
+    if sep:
+        query = re.sub(r'[^=&*$]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+def translate_pattern(path):
+    parts = list(map(re.escape, path.split('*')))
+    for i in range(1, len(parts)-1):
+        parts[i] = f'(?>.*?{parts[i]})'
+    parts[-1] = f'.*{parts[-1]}'
+    return ''.join(parts)
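
translate_pattern() is what keeps PathWithManyWildcardsTest fast: each inner '*' compiles to an atomic group (?>...), so the regex engine cannot backtrack into it and matching stays near-linear instead of exponential. A sketch (atomic groups require Python 3.11+):

    from urllib.robotparser import translate_pattern

    print(translate_pattern('/foo*bar*baz'))
    # -> '/foo(?>.*?bar).*baz'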
+
+def merge_entries(e1, e2):
+    entry = Entry()
+    entry.useragents = list(filter(set(e2.useragents).__contains__, e1.useragents))
+    entry.rulelines = e1.rulelines + e2.rulelines
+    entry.delay = e1.delay if e2.delay is None else e2.delay
+    entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate
+    return entry
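
merge_entries() is what makes repeated groups for the same agent behave as one (see MergeGroupsTest); a minimal sketch using the helpers added here:

    from urllib.robotparser import Entry, RuleLine, merge_entries

    e1 = Entry()
    e1.useragents = ['spambot']
    e1.rulelines = [RuleLine('/some/path', False)]
    e2 = Entry()
    e2.useragents = ['spambot']
    e2.rulelines = [RuleLine('/another/path', False)]
    merged = merge_entries(e1, e2)
    print(merged.useragents)      # ['spambot']
    print(len(merged.rulelines))  # 2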

Misc/NEWS.d (new entry)
@@ -0,0 +1 @@
+Support :rfc:`9309` in :mod:`urllib.robotparser`.