cpython/Lib/test/test_robotparser.py

import io
import os
import threading
import unittest
import urllib.robotparser
from test import support
from test.support import socket_helper
from test.support import threading_helper
from http.server import BaseHTTPRequestHandler, HTTPServer


class BaseRobotTest:
    robots_txt = ''
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None
    expected_output = None

    def __init_subclass__(cls):
        super().__init_subclass__()
        # Remove tests that do nothing.
        if issubclass(cls, unittest.TestCase):
            if not cls.good:
                cls.test_good_urls = None
            if not cls.bad:
                cls.test_bad_urls = None
            if cls.expected_output is None:
                cls.test_string_formatting = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
        self.parser = urllib.robotparser.RobotFileParser()
        self.parser.parse(lines)

    def get_agent_and_url(self, url):
        if isinstance(url, tuple):
            agent, url = url
            return agent, url
        return self.agent, url

    def test_good_urls(self):
        for url in self.good:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertTrue(self.parser.can_fetch(agent, url))

    def test_bad_urls(self):
        for url in self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)

    def test_string_formatting(self):
        self.assertEqual(str(self.parser), self.expected_output)

class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']

class SimpleExampleTest(BaseRobotTest, unittest.TestCase):
    # Example from RFC 9309, section 5.1.
    robots_txt = """\
User-Agent: *
Disallow: *.gif$
Disallow: /example/
Allow: /publications/

User-Agent: foobot
Disallow:/
Allow:/example/page.html
Allow:/example/allowed.gif

User-Agent: barbot
User-Agent: bazbot
Disallow: /example/page.html

User-Agent: quxbot
    """
    good = [
        '/', '/publications/',
        ('foobot', '/example/page.html'), ('foobot', '/example/allowed.gif'),
        ('barbot', '/'), ('barbot', '/example/'),
            ('barbot', '/example/allowed.gif'),
            ('barbot', '/example/disallowed.gif'),
            ('barbot', '/publications/'),
            ('barbot', '/publications/allowed.gif'),
        ('bazbot', '/'), ('bazbot', '/example/'),
            ('bazbot', '/example/allowed.gif'),
            ('bazbot', '/example/disallowed.gif'),
            ('bazbot', '/publications/'),
            ('bazbot', '/publications/allowed.gif'),
        ('quxbot', '/'), ('quxbot', '/example/'),
            ('quxbot', '/example/page.html'), ('quxbot', '/example/allowed.gif'),
            ('quxbot', '/example/disallowed.gif'),
            ('quxbot', '/publications/'),
            ('quxbot', '/publications/allowed.gif'),
        ]
    bad = [
        '/example/', '/example/page.html', '/example/allowed.gif',
            '/example/disallowed.gif',
            '/publications/allowed.gif',
        ('foobot', '/'), ('foobot', '/example/'),
            ('foobot', '/example/disallowed.gif'),
            ('foobot', '/publications/'),
            ('foobot', '/publications/allowed.gif'),
        ('barbot', '/example/page.html'),
        ('bazbot', '/example/page.html'),
    ]


class CrawlDelayAndCustomAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:
    """
    good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']


class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
User-agent: *
Disallow: /
    """
    good = ['/robots.txt']
    bad = ['/cyberworld/map/index.html', '/', '/tmp/']


class BaseRequestRateTest(BaseRobotTest):
    request_rate = None
    crawl_delay = None

    def test_request_rate(self):
        parser = self.parser
        for url in self.good + self.bad:
            agent, url = self.get_agent_and_url(url)
            with self.subTest(url=url, agent=agent):
                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)

                parsed_request_rate = parser.request_rate(agent)
                self.assertEqual(parsed_request_rate, self.request_rate)
                if self.request_rate is not None:
                    self.assertIsInstance(
                        parsed_request_rate,
                        urllib.robotparser.RequestRate
                    )
                    self.assertEqual(
                        parsed_request_rate.requests,
                        self.request_rate.requests
                    )
                    self.assertEqual(
                        parsed_request_rate.seconds,
                        self.request_rate.seconds
                    )


class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = ''
    good = ['/foo']
    expected_output = ''


class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: figtree
Crawl-delay: 3
Request-rate: 9/30
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
    """
    agent = 'figtree'
    request_rate = urllib.robotparser.RequestRate(9, 30)
    crawl_delay = 3
    good = [('figtree', '/foo.html')]
    bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
           '/a%2fb.html', '/~joe/index.html']


class DifferentAgentTest(CrawlDelayAndRequestRateTest):
    agent = 'FigTree Robot libwww-perl/5.04'


class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
Crawl-delay: 3
Request-rate: 9/banana
    """
    good = ['/tmp']
    bad = ['/tmp/', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', '/a/b.html',
           '/%7Ejoe/index.html']
    crawl_delay = 3


class InvalidCrawlDelayTest(BaseRobotTest, unittest.TestCase):
    # From bug report #523041
    robots_txt = """\
User-Agent: *
Disallow: /.
Crawl-delay: pears
    """
    good = ['/foo.html']
    # bug report says "/" should be denied, but that is not in the RFC
    bad = []


class AnotherInvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
    # also test that Allow and Diasallow works well with each other
    robots_txt = """\
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
Request-rate: whale/banana
    """
    agent = 'Googlebot'
    good = ['/folder1/myfile.html']
    bad = ['/folder1/anotherfile.html']


class UserAgentOrderingTest(BaseRobotTest, unittest.TestCase):
    # the order of User-agent should not matter
    robots_txt = """\
User-agent: Googlebot
Disallow: /
Allow: /folder1/

User-agent: Googlebot-Mobile
Allow: /
Disallow: /folder1/
    """
    agent = 'Googlebot'
    bad = ['/something.jpg']
    good = ['/folder1/myfile.html']


class UserAgentGoogleMobileTest(UserAgentOrderingTest):
    agent = 'Googlebot-mobile'
    bad = ['/folder1/myfile.html']
    good = ['/something.jpg']


class LongestMatchTest(BaseRobotTest, unittest.TestCase):
    # Based on example from RFC 9309, section 5.2.
    robots_txt = """\
User-agent: *
Allow: /example/page/
Disallow: /example/page/disallowed.gif
Allow: /example/
    """
    good = ['/example/', '/example/page/']
    bad = ['/example/page/disallowed.gif']


class LongestMatchWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Allow: /example/page/
Disallow: *.gif
Allow: /example/
    """
    good = ['/example/', '/example/page/']
    bad = ['/example/page/disallowed.gif', '/x.gif']


class AllowWinsEqualMatchTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /spam
Allow: /spam
Disallow: /spam
    """
    good = ['/spam', '/spam/']


class AllowWinsEqualFullMatchTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /spam
Allow: /spam$
Disallow: /spam
Disallow: /eggs$
Allow: /eggs
Disallow: /eggs$
    """
    good = ['/spam', '/eggs', '/eggs/']
    bad = ['/spam/']


class AllowWinsEqualMatchWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /spam
Allow: *am
Disallow: /spam
Disallow: *gs
Allow: /eggs
Disallow: *gs
    """
    good = ['/spam', '/eggs', '/spam/', '/eggs/']


class MergeGroupsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: spambot
Disallow: /some/path

User-agent: spambot
Disallow: /another/path
    """
    agent = 'spambot'
    bad = ['/some/path', '/another/path']


class UserAgentStartsGroupTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: spambot
Disallow: /some/path
User-agent: eggsbot
Disallow: /another/path
    """
    good = [('spambot', '/'), ('spambot', '/another/path'),
            ('eggsbot', '/'), ('eggsbot', '/some/path')]
    bad = [('spambot', '/some/path'), ('eggsbot', '/another/path')]
    expected_output = """\
User-agent: spambot
Disallow: /some/path

User-agent: eggsbot
Disallow: /another/path\
"""

class IgnoreEmptyLinesTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: spambot

User-agent: eggsbot
Disallow: /some/path

Disallow: /another/path
    """
    good = [('spambot', '/'), ('eggsbot', '/')]
    bad = [
        ('spambot', '/some/path'), ('spambot', '/another/path'),
        ('eggsbot', '/some/path'), ('eggsbot', '/another/path'),
    ]
    expected_output = """\
User-agent: spambot
User-agent: eggsbot
Disallow: /some/path
Disallow: /another/path\
"""


class IgnoreRulesWithoutUserAgentTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
Disallow: /some/path

User-agent: *
Disallow: /another/path
    """
    good = ['/', '/some/path']
    bad = ['/another/path']
    expected_output = """\
User-agent: *
Disallow: /another/path\
"""


class EmptyGroupTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /some/path

User-agent: spambot
    """
    agent = 'spambot'
    good = ['/', '/some/path']
    expected_output = """\
User-agent: *
Disallow: /some/path

User-agent: spambot
Allow:\
"""


class WeirdPathTest(BaseRobotTest, unittest.TestCase):
    robots_txt = f"""\
User-agent: *
Disallow: /a$$$
Disallow: /b$z
Disallow: /c***
Disallow: /d***z
Disallow: /e*$**$$
Disallow: /f*$**$$z
Disallow: /g$*$$**
Disallow: /h$*$$**z
    """
    good = ['/ax', '/a$$', '/b', '/bz', '/b$z', '/d', '/f', '/fz',
            '/f$$$z', '/fx$y$$z', '/gx', '/g$$$', '/g$x$$y', '/h', '/hz',
            '/h$$$z', '/h$x$$yz']
    bad = ['/a', '/c', '/cxy', '/dz', '/dxyz', '/dxzy', '/e', '/exy',
           '/e$$', '/ex$y$', '/g']
    expected_output = """\
User-agent: *
Disallow: /a$
Disallow: /c*
Disallow: /d*z
Disallow: /e*$
Disallow: /g$\
"""


class PathWithManyWildcardsTest(BaseRobotTest, unittest.TestCase):
    # This test would take many years if use naive translation to regular
    # expression (* -> .*).
    N = 50
    robots_txt = f"""\
User-agent: *
Disallow: /{'*a'*N}*b
    """
    good = ['/' + 'a'*N + 'a']
    bad = ['/' + 'a'*N + 'b']


class DisallowQueryStringTest(BaseRobotTest, unittest.TestCase):
    # see issue #6325 for details
    robots_txt = """\
User-agent: *
Disallow: /some/path?name=value
Disallow: /another/path?
Disallow: /yet/one/path?name=value&more
    """
    good = ['/some/path', '/some/path?',
            '/some/path%3Fname=value', '/some/path?name%3Dvalue',
            '/another/path', '/another/path%3F',
            '/yet/one/path?name=value%26more',
            '/some/pathxname=value']
    bad = ['/some/path?name=value'
           '/another/path?', '/another/path?name=value',
           '/yet/one/path?name=value&more']


class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Disallow: /a1/Z-._~ # unreserved characters
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
Disallow: /u2/%f0%9f%90%8d
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
Disallow: /v2/%f0
Disallow: /v3/\udcf0 # raw non-ASCII octet
Disallow: /p1%xy # raw percent
Disallow: /p2%
Disallow: /p3%25xy # percent-encoded percent
Disallow: /p4%2525xy # double percent-encoded percent
Disallow: /john%20smith # space
Disallow: /john doe
Disallow: /trailingspace%20
Disallow: /question%3Fq=v # not query
Disallow: /hash%23f # not fragment
Disallow: /dollar%24
Disallow: /asterisk%2A
Disallow: /sub/dir
Disallow: /slash%2F
Disallow: /query/question?q=%3F
Disallow: /query/raw/question?q=?
Disallow: /query/eq?q%3Dv
Disallow: /query/amp?q=v%26a
"""
    good = [
        '/u1/%F0', '/u1/%f0',
        '/u2/%F0', '/u2/%f0',
        '/u3/%F0', '/u3/%f0',
        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
        '/question?q=v',
        '/dollar', '/asterisk',
        '/query/eq?q=v',
        '/query/amp?q=v&a',
    ]
    bad = [
        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
        '/p1%xy', '/p1%25xy',
        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
        '/p3%xy', '/p3%25xy',
        '/p4%2525xy',
        '/john%20smith', '/john smith',
        '/john%20doe', '/john doe',
        '/trailingspace%20', '/trailingspace ',
        '/question%3Fq=v',
        '/hash#f', '/hash%23f',
        '/dollar$', '/dollar%24',
        '/asterisk*', '/asterisk%2A',
        '/sub/dir', '/sub%2Fdir',
        '/slash%2F', '/slash/',
        '/query/question?q=?', '/query/question?q=%3F',
        '/query/raw/question?q=?', '/query/raw/question?q=%3F',
        '/query/eq?q%3Dv',
        '/query/amp?q=v%26a',
    ]
    # other reserved characters
    for c in ":/#[]@!$&'()*+,;=":
        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
        bad.append(f'/raw{c}')
        bad.append(f'/raw%{ord(c):02X}')
        bad.append(f'/pc{c}')
        bad.append(f'/pc%{ord(c):02X}')


class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
    """
    request_rate = urllib.robotparser.RequestRate(3, 15)
    crawl_delay = 1
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']


class StringFormattingTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow: /some/path
    """

    expected_output = """\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/

User-agent: cybermapper
Disallow: /some/path\
"""


class ConstructedStringFormattingTest(unittest.TestCase):
    def test_empty(self):
        parser = urllib.robotparser.RobotFileParser()
        self.assertEqual(str(parser), '')

    def test_group_without_rules(self):
        parser = urllib.robotparser.RobotFileParser()
        entry = urllib.robotparser.Entry()
        entry.useragents = ['spambot']
        parser._add_entry(entry)
        entry = urllib.robotparser.Entry()
        entry.useragents = ['hambot']
        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
        parser._add_entry(entry)
        entry = urllib.robotparser.Entry()
        entry.useragents = ['eggsbot']
        parser._add_entry(entry)
        self.assertEqual(str(parser), """\
User-agent: spambot
Allow:

User-agent: hambot
Disallow: /ham

User-agent: eggsbot
Allow:\
""")

    def test_group_without_user_agent(self):
        parser = urllib.robotparser.RobotFileParser()
        entry = urllib.robotparser.Entry()
        entry.rulelines = [urllib.robotparser.RuleLine('/ham', False)]
        parser._add_entry(entry)
        entry = urllib.robotparser.Entry()
        entry.useragents = ['spambot']
        entry.rulelines = [urllib.robotparser.RuleLine('/spam', False)]
        parser._add_entry(entry)
        entry = urllib.robotparser.Entry()
        entry.rulelines = [urllib.robotparser.RuleLine('/eggs', False)]
        parser._add_entry(entry)
        self.assertEqual(str(parser), """\
User-agent: spambot
Disallow: /spam\
""")


@unittest.skipUnless(
    support.has_socket_support,
    "Socket server requires working socket."
)
class BaseLocalNetworkTestCase:

    def setUp(self):
        # clear _opener global variable
        self.addCleanup(urllib.request.urlcleanup)

        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)

        self.t = threading.Thread(
            name='HTTPServer serving',
            target=self.server.serve_forever,
            # Short poll interval to make the test finish quickly.
            # Time between requests is short enough that we won't wake
            # up spuriously too many times.
            kwargs={'poll_interval':0.01})
        self.t.daemon = True  # In case this function raises.
        self.t.start()

    def tearDown(self):
        self.server.shutdown()
        self.t.join()
        self.server.server_close()


SAMPLE_ROBOTS_TXT = b'''\
User-agent: test_robotparser
Disallow: /utf8/\xf0\x9f\x90\x8d
Disallow: /non-utf8/\xf0
Disallow: //[spam]/path
'''


class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    class RobotHandler(BaseHTTPRequestHandler):

        def do_GET(self):
            self.send_response(200)
            self.end_headers()
            self.wfile.write(SAMPLE_ROBOTS_TXT)

        def log_message(self, format, *args):
            pass

    @threading_helper.reap_threads
    def testRead(self):
        # Test that reading a weird robots.txt doesn't fail.
        addr = self.server.server_address
        url = f'http://{socket_helper.HOST}:{addr[1]}'
        robots_url = url + '/robots.txt'
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        # And it can even interpret the weird paths in some reasonable way.
        agent = 'test_robotparser'
        self.assertTrue(parser.can_fetch(agent, robots_url))
        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))


class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
    class RobotHandler(BaseHTTPRequestHandler):

        def do_GET(self):
            self.send_error(403, "Forbidden access")

        def log_message(self, format, *args):
            pass

    @threading_helper.reap_threads
    def testPasswordProtectedSite(self):
        addr = self.server.server_address
        url = 'http://' + socket_helper.HOST + ':' + str(addr[1])
        robots_url = url + "/robots.txt"
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url(url)
        parser.read()
        self.assertFalse(parser.can_fetch("*", robots_url))


@support.requires_working_socket()
class NetworkTestCase(unittest.TestCase):

    base_url = 'http://www.pythontest.net/'
    robots_txt = '{}elsewhere/robots.txt'.format(base_url)

    @classmethod
    def setUpClass(cls):
        support.requires('network')
        with socket_helper.transient_internet(cls.base_url):
            cls.parser = urllib.robotparser.RobotFileParser(cls.robots_txt)
            cls.parser.read()

    def url(self, path):
        return '{}{}{}'.format(
            self.base_url, path, '/' if not os.path.splitext(path)[1] else ''
        )

    def test_basic(self):
        self.assertFalse(self.parser.disallow_all)
        self.assertFalse(self.parser.allow_all)
        self.assertGreater(self.parser.mtime(), 0)
        self.assertFalse(self.parser.crawl_delay('*'))
        self.assertFalse(self.parser.request_rate('*'))

    def test_can_fetch(self):
        self.assertTrue(self.parser.can_fetch('*', self.url('elsewhere')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.base_url))
        self.assertTrue(self.parser.can_fetch('Nutch', self.url('brian')))
        self.assertFalse(self.parser.can_fetch('Nutch', self.url('webstats')))
        self.assertFalse(self.parser.can_fetch('*', self.url('webstats')))
        self.assertTrue(self.parser.can_fetch('*', self.base_url))

    def test_read_404(self):
        parser = urllib.robotparser.RobotFileParser(self.url('i-robot.txt'))
        parser.read()
        self.assertTrue(parser.allow_all)
        self.assertFalse(parser.disallow_all)
        self.assertEqual(parser.mtime(), 0)
        self.assertIsNone(parser.crawl_delay('*'))
        self.assertIsNone(parser.request_rate('*'))

if __name__=='__main__':
    unittest.main()