apetest.spider module

Keeps track of links between pages.

Use spider_req() to create a Spider, then iterate through it to receive new requests to check, and call Spider.add_requests() to add the links you found while checking.

At any point during or after the crawling, Spider.iter_referring_requests() can be used to ask which other requests linked to a given request.
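
The sketch below shows that loop end to end. It assumes first_req is an apetest.request.Request built elsewhere (for example from a command-line URL) and that check_page is a hypothetical helper standing in for your own checking logic.

from apetest.spider import spider_req

spider, robots_report = spider_req(first_req)    # first_req: assumed to exist already
for request in spider:                           # each yielded request is marked as checked
    referrers = check_page(request)              # hypothetical: fetch and extract links
    spider.add_requests(request, referrers)      # newly discovered links join the queue

# During or after the crawl, ask which pages linked to a given request.
for source in spider.iter_referring_requests(request):
    print('linked from:', source)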

Source code
# SPDX-License-Identifier: BSD-3-Clause

"""Keeps track of links between pages.

Use `spider_req` to create a `Spider`, then iterate through it to receive
new requests to check and call `Spider.add_requests` to add links you found
while checking.

At any point during or after the crawling, `Spider.iter_referring_requests`
can be used to ask which other requests linked to a given request.
"""

from collections import defaultdict
from urllib.parse import urljoin, urlsplit

from apetest.fetch import USER_AGENT_PREFIX, load_text
from apetest.robots import (
    lookup_robots_rules, parse_robots_txt, path_allowed, scan_robots_txt
    )

class Spider:
    """Web crawler that remembers which requests have been discovered,
    which have been checked and the links between them.

    Instances of this class are iterable. Every request yielded is
    automatically marked as visited. It is valid to add new requests
    while iterating.
    """

    max_queries_per_page = 100
    """Maximum number of queries to generate with the same path.

    For pages with many arguments, the number of possible queries can
    become so large that it is not feasible to check them all.
    """
    # TODO: Currently just the first 100 are checked, it would be better
    #       to try variations of all query arguments.

    def __init__(self, first_req, rules):
        """Initializes a spider that starts at `first_req` and follows
        the given exclusion rules.

        In most cases, you should use `spider_req` instead.
        """
        self._base_url = first_req.page_url
        self._rules = rules
        self._requests_to_check = set([first_req])
        self._requests_checked = set()
        self._queries_per_page = defaultdict(int)
        # Maps source request to referrers (destination).
        self._site_graph = {}
        # Maps destination page to source requests.
        self._page_referred_from = defaultdict(set)

    def __iter__(self):
        checked = self._requests_checked
        to_check = self._requests_to_check
        while to_check:
            print('checked: %d, to check: %d' % (len(checked), len(to_check)))
            request = min(to_check)
            to_check.remove(request)
            checked.add(request)
            yield request

    def referrer_allowed(self, referrer):
        """Returns `True` iff this spider is allowed to visit the resources
        referenced by `referrer`.
        """
        # TODO: Currently the 'checker' module rejects out-of-scope URLs,
        #       but it would be cleaner to do that at the spider level,
        #       in case we ever want to support crawling multiple roots
        #       or want to report all external links.
        path = urlsplit(referrer.page_url).path or '/'
        base_url = self._base_url
        if base_url.startswith('file:'):
            base_path = urlsplit(base_url).path or '/'
            if not path.startswith(base_path):
                # Path is outside the tree rooted at our base URL.
                return False
            path = path[base_path.rindex('/'):]

        return path_allowed(path, self._rules)

    def add_requests(self, source_req, referrers):
        """Adds the requests from `referrers`, which were discovered
        in `source_req`.

        Added requests that were not discovered before are registered
        as to be checked. The spider also remembers that `source_req`
        links to the added requests.
        """

        # Filter referrers according to rules.
        allowed_referrers = [
            referrer
            for referrer in referrers
            if self.referrer_allowed(referrer)
            ]

        # Currently each request is only visited once, so we do not have to
        # merge data, but that might change once we start doing POSTs.
        assert source_req not in self._site_graph
        self._site_graph[source_req] = allowed_referrers

        for referrer in allowed_referrers:
            url = referrer.page_url
            self._page_referred_from[url].add(source_req)

            for request in referrer.iter_requests():
                if request in self._requests_checked \
                or request in self._requests_to_check:
                    continue
                if self._queries_per_page[url] >= self.max_queries_per_page:
                    print('maximum number of queries reached for "%s"' % url)
                    break
                self._queries_per_page[url] += 1
                self._requests_to_check.add(request)

    def iter_referring_requests(self, dest_req):
        """Iterates through the requests that refer to the given request.
        """
        for source_req in self._page_referred_from[dest_req.page_url]:
            for referrer in self._site_graph[source_req]:
                if referrer.has_request(dest_req):
                    yield source_req

def spider_req(first_req):
    """Creates a `Spider` that starts at the given `apetest.request.Request`.

    This function will attempt to read `robots.txt` from the server
    or base directory contained in `first_req`. Any rules found there
    that apply to APE will be passed on to the new `Spider`.
    """
    base_url = first_req.page_url
    if base_url.startswith('file:'):
        robots_url = urljoin(base_url, 'robots.txt')
    else:
        robots_url = urljoin(base_url, '/robots.txt')

    print('fetching "robots.txt"...')
    report, response, robots_lines = load_text(robots_url)
    if robots_lines is None:
        if response is not None and response.code == 404:
            # It is not an error if "robots.txt" does not exist.
            print('no "robots.txt" was found')
            report = None
        rules = []
    else:
        robots_records = scan_robots_txt(robots_lines, report)
        rules_map = parse_robots_txt(robots_records, report)
        rules = lookup_robots_rules(rules_map, USER_AGENT_PREFIX)
        report.checked = True

    return Spider(first_req, rules), report

Functions

def spider_req(first_req)

Creates a Spider that starts at the given Request.

This function will attempt to read robots.txt from the server or base directory contained in first_req. Any rules found there that apply to APE will be passed on to the new Spider.
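
A minimal sketch of calling it, assuming first_req is an apetest.request.Request for the URL under test. The returned report describes the robots.txt fetch; it is None when the file simply does not exist (HTTP 404), so only record it when present.

spider, report = spider_req(first_req)   # first_req: assumed to exist already
if report is not None:
    handle_report(report)                # hypothetical: pass it to your reporting layer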

Source code
def spider_req(first_req):
    """Creates a `Spider` that starts at the given `apetest.request.Request`.

    This function will attempt to read `robots.txt` from the server
    or base directory contained in `first_req`. Any rules found there
    that apply to APE will be passed on to the new `Spider`.
    """
    base_url = first_req.page_url
    if base_url.startswith('file:'):
        robots_url = urljoin(base_url, 'robots.txt')
    else:
        robots_url = urljoin(base_url, '/robots.txt')

    print('fetching "robots.txt"...')
    report, response, robots_lines = load_text(robots_url)
    if robots_lines is None:
        if response is not None and response.code == 404:
            # It is not an error if "robots.txt" does not exist.
            print('no "robots.txt" was found')
            report = None
        rules = []
    else:
        robots_records = scan_robots_txt(robots_lines, report)
        rules_map = parse_robots_txt(robots_records, report)
        rules = lookup_robots_rules(rules_map, USER_AGENT_PREFIX)
        report.checked = True

    return Spider(first_req, rules), report

Classes

class Spider

Web crawler that remembers which requests have been discovered, which have been checked and the links between them.

Instances of this class are iterable. Every request yielded is automatically marked as visited. It is valid to add new requests while iterating.

Source code
class Spider:
    """Web crawler that remembers which requests have been discovered,
    which have been checked and the links between them.

    Instances of this class are iterable. Every request yielded is
    automatically marked as visited. It is valid to add new requests
    while iterating.
    """

    max_queries_per_page = 100
    """Maximum number of queries to generate with the same path.

    For pages with many arguments, the number of possible queries can
    become so large that it is not feasible to check them all.
    """
    # TODO: Currently just the first 100 are checked, it would be better
    #       to try variations of all query arguments.

    def __init__(self, first_req, rules):
        """Initializes a spider that starts at `first_req` and follows
        the given exclusion rules.

        In most cases, you should use `spider_req` instead.
        """
        self._base_url = first_req.page_url
        self._rules = rules
        self._requests_to_check = set([first_req])
        self._requests_checked = set()
        self._queries_per_page = defaultdict(int)
        # Maps source request to referrers (destination).
        self._site_graph = {}
        # Maps destination page to source requests.
        self._page_referred_from = defaultdict(set)

    def __iter__(self):
        checked = self._requests_checked
        to_check = self._requests_to_check
        while to_check:
            print('checked: %d, to check: %d' % (len(checked), len(to_check)))
            request = min(to_check)
            to_check.remove(request)
            checked.add(request)
            yield request

    def referrer_allowed(self, referrer):
        """Returns `True` iff this spider is allowed to visit the resources
        referenced by `referrer`.
        """
        # TODO: Currently the 'checker' module rejects out-of-scope URLs,
        #       but it would be cleaner to do that at the spider level,
        #       in case we ever want to support crawling multiple roots
        #       or want to report all external links.
        path = urlsplit(referrer.page_url).path or '/'
        base_url = self._base_url
        if base_url.startswith('file:'):
            base_path = urlsplit(base_url).path or '/'
            if not path.startswith(base_path):
                # Path is outside the tree rooted at our base URL.
                return False
            path = path[base_path.rindex('/'):]

        return path_allowed(path, self._rules)

    def add_requests(self, source_req, referrers):
        """Adds the requests from `referrers`, which were discovered
        in `source_req`.

        Added requests that were not discovered before are registered
        as to be checked. The spider also remembers that `source_req`
        links to the added requests.
        """

        # Filter referrers according to rules.
        allowed_referrers = [
            referrer
            for referrer in referrers
            if self.referrer_allowed(referrer)
            ]

        # Currently each request is only visited once, so we do not have to
        # merge data, but that might change once we start doing POSTs.
        assert source_req not in self._site_graph
        self._site_graph[source_req] = allowed_referrers

        for referrer in allowed_referrers:
            url = referrer.page_url
            self._page_referred_from[url].add(source_req)

            for request in referrer.iter_requests():
                if request in self._requests_checked \
                or request in self._requests_to_check:
                    continue
                if self._queries_per_page[url] >= self.max_queries_per_page:
                    print('maximum number of queries reached for "%s"' % url)
                    break
                self._queries_per_page[url] += 1
                self._requests_to_check.add(request)

    def iter_referring_requests(self, dest_req):
        """Iterates through the requests that refer to the given request.
        """
        for source_req in self._page_referred_from[dest_req.page_url]:
            for referrer in self._site_graph[source_req]:
                if referrer.has_request(dest_req):
                    yield source_req

Class variables

var max_queries_per_page

Maximum number of queries to generate with the same path.

For pages with many arguments, the number of possible queries can become so large that it is not feasible to check them all.
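
Since max_queries_per_page is a plain class attribute, the cap can be raised or lowered on an instance (or in a subclass) before crawling starts; a sketch, assuming spider was created with spider_req:

spider, report = spider_req(first_req)   # first_req: assumed to exist already
spider.max_queries_per_page = 500        # allow more query variations per path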

Methods

def __init__(self, first_req, rules)

Initializes a spider that starts at first_req and follows the given exclusion rules.

In most cases, you should use spider_req() instead.

Source code
def __init__(self, first_req, rules):
    """Initializes a spider that starts at `first_req` and follows
    the given exclusion rules.

    In most cases, you should use `spider_req` instead.
    """
    self._base_url = first_req.page_url
    self._rules = rules
    self._requests_to_check = set([first_req])
    self._requests_checked = set()
    self._queries_per_page = defaultdict(int)
    # Maps source request to referrers (destination).
    self._site_graph = {}
    # Maps destination page to source requests.
    self._page_referred_from = defaultdict(set)

def add_requests(self, source_req, referrers)

Adds the requests from referrers, which were discovered in source_req.

Added requests that were not discovered before are registered as to be checked. The spider also remembers that source_req links to the added requests.
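
The referrer objects are whatever your checker produces when it extracts links; Spider only relies on them exposing page_url, iter_requests() and has_request(). The LinkStub class below is a hypothetical stand-in that illustrates the expected duck type, not part of apetest.

class LinkStub:
    """Hypothetical minimal referrer: a link resolving to a single request."""

    def __init__(self, request):
        self._request = request
        self.page_url = request.page_url

    def iter_requests(self):
        yield self._request

    def has_request(self, request):
        return request == self._request

# spider, source_req and linked_req are assumed to exist already.
spider.add_requests(source_req, [LinkStub(linked_req)])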

Source code
def add_requests(self, source_req, referrers):
    """Adds the requests from `referrers`, which were discovered
    in `source_req`.

    Added requests that were not discovered before are registered
    as to be checked. The spider also remembers that `source_req`
    links to the added requests.
    """

    # Filter referrers according to rules.
    allowed_referrers = [
        referrer
        for referrer in referrers
        if self.referrer_allowed(referrer)
        ]

    # Currently each request is only visited once, so we do not have to
    # merge data, but that might change once we start doing POSTs.
    assert source_req not in self._site_graph
    self._site_graph[source_req] = allowed_referrers

    for referrer in allowed_referrers:
        url = referrer.page_url
        self._page_referred_from[url].add(source_req)

        for request in referrer.iter_requests():
            if request in self._requests_checked \
            or request in self._requests_to_check:
                continue
            if self._queries_per_page[url] >= self.max_queries_per_page:
                print('maximum number of queries reached for "%s"' % url)
                break
            self._queries_per_page[url] += 1
            self._requests_to_check.add(request)

def iter_referring_requests(self, dest_req)

Iterates through the requests that refer to the given request.
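
A typical use is reporting where a broken link was found, assuming failed_req is a request that was flagged while checking:

# spider and failed_req are assumed to exist already.
for source_req in spider.iter_referring_requests(failed_req):
    print('"%s" is linked from "%s"' % (failed_req, source_req))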

Source code
def iter_referring_requests(self, dest_req):
    """Iterates through the requests that refer to the given request.
    """
    for source_req in self._page_referred_from[dest_req.page_url]:
        for referrer in self._site_graph[source_req]:
            if referrer.has_request(dest_req):
                yield source_req

def referrer_allowed(self, referrer)

Returns True iff this spider is allowed to visit the resources referenced by referrer.
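
add_requests already applies this filter, but it can also be called directly, for example to log links that fall outside the crawl scope or are disallowed by robots.txt. Here referrer is any object with a page_url attribute:

# spider and referrer are assumed to exist already.
if not spider.referrer_allowed(referrer):
    print('skipping out-of-scope or disallowed link: "%s"' % referrer.page_url)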

Source code
def referrer_allowed(self, referrer):
    """Returns `True` iff this spider is allowed to visit the resources
    referenced by `referrer`.
    """
    # TODO: Currently the 'checker' module rejects out-of-scope URLs,
    #       but it would be cleaner to do that at the spider level,
    #       in case we ever want to support crawling multiple roots
    #       or want to report all external links.
    path = urlsplit(referrer.page_url).path or '/'
    base_url = self._base_url
    if base_url.startswith('file:'):
        base_path = urlsplit(base_url).path or '/'
        if not path.startswith(base_path):
            # Path is outside the tree rooted at our base URL.
            return False
        path = path[base_path.rindex('/'):]

    return path_allowed(path, self._rules)