mirror of https://github.com/python/cpython.git
gh-138907: Support RFC 9309 in robotparser (GH-138908)
* empty lines are always ignored instead of separating groups
* the "user-agent" line after a rule starts a new group
* groups matching the same user agent are now merged
* the rule with the longest match wins instead of the first matching rule
* in case of equal matches, the “Allow” rule wins over “Disallow”
* special characters “$” and “*” are now supported in rules
* prefer full match for user agent
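
As a quick illustration of the new matching rules through the public API (a sketch; the agent name and URLs below are made up):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.parse("""\
User-agent: *
Allow: /example/page/
Disallow: *.gif$
""".splitlines())

# Longest match wins: the Disallow wildcard matches the whole 21-character
# path, which beats the 14-character Allow prefix.
print(rp.can_fetch("somebot", "https://example.com/example/page/pic.gif"))     # False

# Only the Allow rule matches here.
print(rp.can_fetch("somebot", "https://example.com/example/page/index.html"))  # True

Under the previous first-match behavior the first call would have returned True, because the Allow rule appears before the Disallow rule.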
Doc/library/urllib.robotparser.rst

@@ -18,7 +18,7 @@
 This module provides a single class, :class:`RobotFileParser`, which answers
 questions about whether or not a particular user agent can fetch a URL on the
 website that published the :file:`robots.txt` file. For more details on the
-structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
+structure of :file:`robots.txt` files, see :rfc:`9309`.


 .. class:: RobotFileParser(url='')
Lib/test/test_robotparser.py (+301 -39)
@@ -15,14 +15,18 @@ class BaseRobotTest:
     good = []
     bad = []
     site_maps = None
+    expected_output = None

     def __init_subclass__(cls):
         super().__init_subclass__()
         # Remove tests that do nothing.
-        if not cls.good:
-            cls.test_good_urls = None
-        if not cls.bad:
-            cls.test_bad_urls = None
+        if issubclass(cls, unittest.TestCase):
+            if not cls.good:
+                cls.test_good_urls = None
+            if not cls.bad:
+                cls.test_bad_urls = None
+            if cls.expected_output is None:
+                cls.test_string_formatting = None

     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
@@ -50,6 +54,8 @@ class BaseRobotTest:
     def test_site_maps(self):
         self.assertEqual(self.parser.site_maps(), self.site_maps)

+    def test_string_formatting(self):
+        self.assertEqual(str(self.parser), self.expected_output)

 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -61,6 +67,56 @@ Disallow: /foo.html
     good = ['/', '/test.html']
     bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']


+class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
+    # Example from RFC 9309, section 5.1.
+    robots_txt = """\
+User-Agent: *
+Disallow: *.gif$
+Disallow: /example/
+Allow: /publications/
+
+User-Agent: foobot
+Disallow:/
+Allow:/example/page.html
+Allow:/example/allowed.gif
+
+User-Agent: barbot
+User-Agent: bazbot
+Disallow: /example/page.html
+
+User-Agent: quxbot
+"""
+    good = [
+        '/', '/publications/',
+        ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
+        ('barbot', '/'), ('barbot', '/example/'),
+        ('barbot', '/example/allowed.gif'),
+        ('barbot', '/example/disallowed.gif'),
+        ('barbot', '/publications/'),
+        ('barbot', '/publications/allowed.gif'),
+        ('bazbot', '/'), ('bazbot', '/example/'),
+        ('bazbot', '/example/allowed.gif'),
+        ('bazbot', '/example/disallowed.gif'),
+        ('bazbot', '/publications/'),
+        ('bazbot', '/publications/allowed.gif'),
+        ('quxbot', '/'), ('quxbot', '/example/'),
+        ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
+        ('quxbot', '/example/disallowed.gif'),
+        ('quxbot', '/publications/'),
+        ('quxbot', '/publications/allowed.gif'),
+    ]
+    bad = [
+        '/example/', '/example/page.html', '/example/allowed.gif',
+        '/example/disallowed.gif',
+        '/publications/allowed.gif',
+        ('foobot', '/'), ('foobot', '/example/'),
+        ('foobot', '/example/disallowed.gif'),
+        ('foobot', '/publications/'),
+        ('foobot', '/publications/allowed.gif'),
+        ('barbot', '/example/page.html'),
+        ('bazbot', '/example/page.html'),
+    ]
+
+
 class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -102,7 +158,7 @@ class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
 User-agent: *
 Disallow: /
 """
-    good = []
+    good = ['/robots.txt']
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']

@@ -137,6 +193,7 @@ class BaseRequestRateTest(BaseRobotTest):
 class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = ''
     good = ['/foo']
+    expected_output = ''


 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
@@ -203,35 +260,209 @@ Request-rate: whale/banana


 class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
-    # the order of User-agent should be correct. note
-    # that this file is incorrect because "Googlebot" is a
-    # substring of "Googlebot-Mobile"
+    # the order of User-agent should not matter
     robots_txt = """\
 User-agent: Googlebot
 Disallow: /
+Allow: /folder1/

 User-agent: Googlebot-Mobile
 Allow: /
+Disallow: /folder1/
 """
     agent = 'Googlebot'
     bad = ['/something.jpg']
+    good = ['/folder1/myfile.html']


 class UserAgentGoogleMobileTest(UserAgentOrderingTest):
-    agent = 'Googlebot-Mobile'
+    agent = 'Googlebot-mobile'
+    bad = ['/folder1/myfile.html']
+    good = ['/something.jpg']


-class GoogleURLOrderingTest(BaseRobotTest, unittest.TestCase):
-    # Google also got the order wrong. You need
-    # to specify the URLs from more specific to more general
+class LongestMatchTest(BaseRobotTest, unittest.TestCase):
+    # Based on example from RFC 9309, section 5.2.
     robots_txt = """\
-User-agent: Googlebot
-Allow: /folder1/myfile.html
-Disallow: /folder1/
+User-agent: *
+Allow: /example/page/
+Disallow: /example/page/disallowed.gif
+Allow: /example/
 """
-    agent = 'googlebot'
-    good = ['/folder1/myfile.html']
-    bad = ['/folder1/anotherfile.html']
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif']
+
+
+class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Allow: /example/page/
+Disallow: *.gif
+Allow: /example/
+"""
+    good = ['/example/', '/example/page/']
+    bad = ['/example/page/disallowed.gif', '/x.gif']
+
+
+class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam
+Disallow: /spam
+"""
+    good = ['/spam', '/spam/']
+
+
+class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: /spam$
+Disallow: /spam
+Disallow: /eggs$
+Allow: /eggs
+Disallow: /eggs$
+"""
+    good = ['/spam', '/eggs', '/eggs/']
+    bad = ['/spam/']
+
+
+class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /spam
+Allow: *am
+Disallow: /spam
+Disallow: *gs
+Allow: /eggs
+Disallow: *gs
+"""
+    good = ['/spam', '/eggs', '/spam/', '/eggs/']
+
+
+class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: spambot
+Disallow: /another/path
+"""
+    agent = 'spambot'
+    bad = ['/some/path', '/another/path']
+
+
+class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+Disallow: /some/path
+User-agent: eggsbot
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('spambot', '/another/path'),
+            ('eggsbot', '/'), ('eggsbot', '/some/path')]
+    bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
+    expected_output = """\
+User-agent: spambot
+Disallow: /some/path
+
+User-agent: eggsbot
+Disallow: /another/path\
+"""
+
+
+class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: spambot
+
+User-agent: eggsbot
+Disallow: /some/path
+
+Disallow: /another/path
+"""
+    good = [('spambot', '/'), ('eggsbot', '/')]
+    bad = [
+        ('spambot', '/some/path'), ('spambot', '/another/path'),
+        ('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
+    ]
+    expected_output = """\
+User-agent: spambot
+User-agent: eggsbot
+Disallow: /some/path
+Disallow: /another/path\
+"""
+
+
+class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+Disallow: /some/path
+
+User-agent: *
+Disallow: /another/path
+"""
+    good = ['/', '/some/path']
+    bad = ['/another/path']
+    expected_output = """\
+User-agent: *
+Disallow: /another/path\
+"""
+
+
+class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+"""
+    agent = 'spambot'
+    good = ['/', '/some/path']
+    expected_output = """\
+User-agent: *
+Disallow: /some/path
+
+User-agent: spambot
+Allow:\
+"""
+
+
+class WeirdPathTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = f"""\
+User-agent: *
+Disallow: /a$$$
+Disallow: /b$z
+Disallow: /c***
+Disallow: /d***z
+Disallow: /e*$**$$
+Disallow: /f*$**$$z
+Disallow: /g$*$$**
+Disallow: /h$*$$**z
+"""
+    good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz',
+            '/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz',
+            '/h$$$z', '/h$x$$yz']
+    bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy',
+           '/e$$', '/ex$y$', '/g']
+    expected_output = """\
+User-agent: *
+Disallow: /a$
+Disallow: /c*
+Disallow: /d*z
+Disallow: /e*$
+Disallow: /g$\
+"""
+
+
+class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
+    # This test would take many years if use naive translation to regular
+    # expression (* -> .*).
+    N = 50
+    robots_txt = f"""\
+User-agent: *
+Disallow: /{'*a'*N}*b
+"""
+    good = ['/' + 'a'*N + 'a']
+    bad = ['/' + 'a'*N + 'b']


 class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
@@ -245,25 +476,13 @@ Disallow: /yet/one/path?name=value&more
     good = ['/some/path', '/some/path?',
             '/some/path%3Fname=value', '/some/path?name%3Dvalue',
             '/another/path', '/another/path%3F',
-            '/yet/one/path?name=value%26more']
+            '/yet/one/path?name=value%26more',
+            '/some/pathxname=value']
     bad = ['/some/path?name=value'
            '/another/path?', '/another/path?name=value',
            '/yet/one/path?name=value&more']


-class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
-    # obey first * entry (#4108)
-    robots_txt = """\
-User-agent: *
-Disallow: /some/path
-
-User-agent: *
-Disallow: /another/path
-"""
-    good = ['/another/path']
-    bad = ['/some/path']
-
-
 class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
@@ -365,17 +584,60 @@ Disallow: /some/path
 """

     expected_output = """\
-User-agent: cybermapper
-Disallow: /some/path
-
 User-agent: *
 Crawl-delay: 1
 Request-rate: 3/15
-Disallow: /cyberworld/map/\
+Disallow: /cyberworld/map/
+
+User-agent: cybermapper
+Disallow: /some/path\
 """

-    def test_string_formatting(self):
-        self.assertEqual(str(self.parser), self.expected_output)
+
+class ConstructedStringFormattingTest(unittest.TestCase):
+    def test_empty(self):
+        parser = urllib.robotparser.RobotFileParser()
+        self.assertEqual(str(parser), '')
+
+    def test_group_without_rules(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['hambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['eggsbot']
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Allow:
+
+User-agent: hambot
+Disallow: /ham
+
+User-agent: eggsbot
+Allow:\
+""")
+
+    def test_group_without_user_agent(self):
+        parser = urllib.robotparser.RobotFileParser()
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.useragents = ['spambot']
+        entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
+        parser._add_entry(entry)
+        entry = urllib.robotparser.Entry()
+        entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
+        parser._add_entry(entry)
+        self.assertEqual(str(parser), """\
+User-agent: spambot
+Disallow: /spam\
+""")


 @unittest.skipUnless(
@@ -495,7 +757,7 @@ class NetworkTestCase(unittest.TestCase):
     def test_can_fetch(self):
         self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
-        self.assertFalse(self.parser.can_fetch('Nutch', self.url('brian')))
+        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
         self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
         self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
         self.assertTrue(self.parser.can_fetch('*', self.base_url))
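
The merged-group behavior pinned down by MergeGroupsTest above can also be checked interactively through the public API (host name made up):

from urllib import robotparser

rp = robotparser.RobotFileParser()
rp.parse("""\
User-agent: spambot
Disallow: /some/path

User-agent: spambot
Disallow: /another/path
""".splitlines())

# RFC 9309 merges the two "spambot" groups, so both rules apply; the old
# parser honored only the first matching group.
print(rp.can_fetch("spambot", "https://example.com/some/path"))     # False
print(rp.can_fetch("spambot", "https://example.com/another/path"))  # False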
Lib/urllib/robotparser.py (+138 -71)
@@ -7,7 +7,7 @@
     2) PSF license for Python 2.2

     The robots.txt Exclusion Protocol is implemented as specified in
-    http://www.robotstxt.org/norobots-rfc.txt
+    RFC 9309
 """

 import collections
@@ -21,19 +21,6 @@ __all__ = ["RobotFileParser"]
 RequestRate = collections.namedtuple("RequestRate", "requests seconds")


-def normalize(path):
-    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
-    return urllib.parse.quote(unquoted, errors='surrogateescape')
-
-
-def normalize_path(path):
-    path, sep, query = path.partition('?')
-    path = normalize(path)
-    if sep:
-        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
-        path += '?' + query
-    return path
-
-
 class RobotFileParser:
     """ This class provides a set of methods to read, parse and answer
     questions about a single robots.txt file.
@@ -42,6 +29,7 @@ class RobotFileParser:

     def __init__(self, url=''):
         self.entries = []
+        self.groups = {}
         self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
@@ -86,13 +74,13 @@ class RobotFileParser:
         self.parse(raw.decode("utf-8", "surrogateescape").splitlines())

     def _add_entry(self, entry):
-        if "*" in entry.useragents:
-            # the default entry is considered last
-            if self.default_entry is None:
-                # the first default entry wins
-                self.default_entry = entry
-        else:
-            self.entries.append(entry)
+        self.entries.append(entry)
+        for agent in entry.useragents:
+            agent = agent.lower()
+            if agent not in self.groups:
+                self.groups[agent] = entry
+            else:
+                self.groups[agent] = merge_entries(self.groups[agent], entry)

     def parse(self, lines):
         """Parse the input lines from a robots.txt file.
@@ -100,6 +88,7 @@ class RobotFileParser:
         We allow that a user-agent: line is not preceded by
         one or more blank lines.
        """
+        entries = []
         # states:
         #   0: start state
         #   1: saw user-agent line
@@ -109,14 +98,6 @@ class RobotFileParser:

         self.modified()
         for line in lines:
-            if not line:
-                if state == 1:
-                    entry = Entry()
-                    state = 0
-                elif state == 2:
-                    self._add_entry(entry)
-                    entry = Entry()
-                    state = 0
             # remove optional comment and strip line
             i = line.find('#')
             if i >= 0:
@@ -132,16 +113,23 @@ class RobotFileParser:
                 if state == 2:
                     self._add_entry(entry)
                     entry = Entry()
-                entry.useragents.append(line[1])
+                product_token = line[1]
+                entry.useragents.append(product_token)
                 state = 1
             elif line[0] == "disallow":
                 if state != 0:
-                    entry.rulelines.append(RuleLine(line[1], False))
                     state = 2
+                    try:
+                        entry.rulelines.append(RuleLine(line[1], False))
+                    except ValueError:
+                        pass
             elif line[0] == "allow":
                 if state != 0:
-                    entry.rulelines.append(RuleLine(line[1], True))
                     state = 2
+                    try:
+                        entry.rulelines.append(RuleLine(line[1], True))
+                    except ValueError:
+                        pass
             elif line[0] == "crawl-delay":
                 if state != 0:
                     # before trying to convert to int we need to make
@@ -164,9 +152,18 @@ class RobotFileParser:
                 # so it doesn't matter where you place it in your file."
                 # Therefore we do not change the state of the parser.
                 self.sitemaps.append(line[1])
-        if state == 2:
+        if state != 0:
             self._add_entry(entry)

+    def _find_entry(self, useragent):
+        entry = self.groups.get(useragent.lower())
+        if entry is not None:
+            return entry
+        for entry in self.groups.values():
+            if entry.applies_to(useragent):
+                return entry
+        return self.groups.get('*')
+
     def can_fetch(self, useragent, url):
         """using the parsed robots.txt decide if useragent can fetch url"""
         if self.disallow_all:
@@ -179,43 +176,36 @@ class RobotFileParser:
         # calls can_fetch() before calling read().
         if not self.last_checked:
             return False
-        # search for given user agent matches
-        # the first match counts
         # TODO: The private API is used in order to preserve an empty query.
         # This is temporary until the public API starts supporting this feature.
         parsed_url = urllib.parse._urlsplit(url, '')
         url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
-        url = normalize_path(url)
+        url = normalize_uri(url)
         if not url:
             url = "/"
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.allowance(url)
-        # try the default entry last
-        if self.default_entry:
-            return self.default_entry.allowance(url)
-        # agent not found ==> access granted
-        return True
+        if url == '/robots.txt':
+            # The /robots.txt URI is implicitly allowed.
+            return True
+        entry = self._find_entry(useragent)
+        if entry is None:
+            return True
+        return entry.allowance(url)

     def crawl_delay(self, useragent):
         if not self.mtime():
             return None
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.delay
-        if self.default_entry:
-            return self.default_entry.delay
-        return None
+        entry = self._find_entry(useragent)
+        if entry is None:
+            return None
+        return entry.delay

     def request_rate(self, useragent):
         if not self.mtime():
             return None
-        for entry in self.entries:
-            if entry.applies_to(useragent):
-                return entry.req_rate
-        if self.default_entry:
-            return self.default_entry.req_rate
-        return None
+        entry = self._find_entry(useragent)
+        if entry is None:
+            return None
+        return entry.req_rate

     def site_maps(self):
         if not self.sitemaps:
@@ -226,7 +216,7 @@ class RobotFileParser:
         entries = self.entries
         if self.default_entry is not None:
             entries = entries + [self.default_entry]
-        return '\n\n'.join(map(str, entries))
+        return '\n\n'.join(filter(None, map(str, entries)))


 class RuleLine:
     """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
@@ -235,14 +225,42 @@ class RuleLine:
         if path == '' and not allowance:
             # an empty value means allow all
             allowance = True
-        self.path = normalize_path(path)
+        path = re.sub(r'[*]{2,}', '*', path)
+        path = re.sub(r'[$][$*]+', '$', path)
+        path = normalize_pattern(path)
+        self.fullmatch = path.endswith('$')
+        path = path.rstrip('$')
+        if '$' in path:
+            raise ValueError('$ not at the end of path')
+        self.matcher = None
+        if '*' in path:
+            pattern = re.compile(translate_pattern(path), re.DOTALL)
+            if self.fullmatch:
+                self.matcher = pattern.fullmatch
+            else:
+                self.matcher = pattern.match
+        self.path = path
         self.allowance = allowance

     def applies_to(self, filename):
-        return self.path == "*" or filename.startswith(self.path)
+        # If the filename matches the rule, return the matching length plus 1.
+        # If it does not match, return 0.
+        if self.matcher is not None:
+            m = self.matcher(filename)
+            if m:
+                return m.end() + 1
+        else:
+            if self.fullmatch:
+                if filename == self.path:
+                    return len(self.path) + 1
+            else:
+                if filename.startswith(self.path):
+                    return len(self.path) + 1
+        return 0

     def __str__(self):
-        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
+        return (("Allow" if self.allowance else "Disallow") + ": " + self.path
+                + ('$' if self.fullmatch else ''))


 class Entry:
@@ -254,6 +272,8 @@ class Entry:
         self.req_rate = None

     def __str__(self):
+        if not self.useragents:
+            return ''
         ret = []
         for agent in self.useragents:
             ret.append(f"User-agent: {agent}")
@@ -262,27 +282,74 @@ class Entry:
         if self.req_rate is not None:
             rate = self.req_rate
             ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
-        ret.extend(map(str, self.rulelines))
+        if self.rulelines:
+            ret.extend(map(str, self.rulelines))
+        else:
+            ret.append("Allow:")
         return '\n'.join(ret)

     def applies_to(self, useragent):
         """check if this entry applies to the specified agent"""
+        if useragent is None:
+            return '*' in self.useragents
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent == '*':
-                # we have the catch-all agent
-                return True
-            agent = agent.lower()
-            if agent in useragent:
-                return True
+            if agent != '*':
+                agent = agent.lower()
+                if agent in useragent:
+                    return True
         return False

     def allowance(self, filename):
         """Preconditions:
         - our agent applies to this entry
-        - filename is URL encoded"""
+        - filename is URL encoded
+        """
+        best_match = -1
+        allowance = True
         for line in self.rulelines:
-            if line.applies_to(filename):
-                return line.allowance
-        return True
+            m = line.applies_to(filename)
+            if m:
+                if m > best_match:
+                    best_match = m
+                    allowance = line.allowance
+                elif m == best_match and not allowance:
+                    allowance = line.allowance
+        return allowance
+
+
+def normalize(path):
+    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
+    return urllib.parse.quote(unquoted, errors='surrogateescape')
+
+
+def normalize_uri(path):
+    path, sep, query = path.partition('?')
+    path = normalize(path)
+    if sep:
+        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
+def normalize_pattern(path):
+    path, sep, query = path.partition('?')
+    path = re.sub(r'[^*$]+', lambda m: normalize(m[0]), path)
+    if sep:
+        query = re.sub(r'[^=&*$]+', lambda m: normalize(m[0]), query)
+        path += '?' + query
+    return path
+
+
+def translate_pattern(path):
+    parts = list(map(re.escape, path.split('*')))
+    for i in range(1, len(parts)-1):
+        parts[i] = f'(?>.*?{parts[i]})'
+    parts[-1] = f'.*{parts[-1]}'
+    return ''.join(parts)
+
+
+def merge_entries(e1, e2):
+    entry = Entry()
+    entry.useragents = list(filter(set(e2.useragents).__contains__, e1.useragents))
+    entry.rulelines = e1.rulelines + e2.rulelines
+    entry.delay = e1.delay if e2.delay is None else e2.delay
+    entry.req_rate = e1.req_rate if e2.req_rate is None else e2.req_rate
+    return entry
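
A note on translate_pattern: inner "*" segments are compiled to atomic groups "(?>.*?...)" instead of a naive ".*", which keeps matching fast on adversarial patterns (this is what PathWithManyWildcardsTest exercises). A small self-contained demonstration of the same idea, assuming Python 3.11+ where the re module supports atomic groups:

import re

N = 25
pattern_text = '/' + '*a' * N + '*b'   # a robots.txt-style path pattern

# A naive translation ('*' -> '.*') of this pattern explores on the order of
# 2**N backtracking paths against '/aaa...a'; do not try to run that version.

# Translation in the style of the patch: once an atomic group has matched,
# the regex engine never re-enters it to try alternatives.
parts = list(map(re.escape, pattern_text.split('*')))
for i in range(1, len(parts) - 1):
    parts[i] = f'(?>.*?{parts[i]})'
parts[-1] = f'.*{parts[-1]}'
regex = re.compile(''.join(parts), re.DOTALL)

print(regex.match('/' + 'a' * N + 'a'))  # None, returned immediately
print(regex.match('/' + 'a' * N + 'b'))  # a match object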
Misc/NEWS.d (new news entry)

@@ -0,0 +1 @@
+Support :rfc:`9309` in :mod:`urllib.robotparser`.