apetest.checker module

Checks a document for problems and finds links to other documents.

The PageChecker class is where the work is done.

Source code
# SPDX-License-Identifier: BSD-3-Clause

"""Checks a document for problems and finds links to other documents.

The `PageChecker` class is where the work is done.
"""

from collections import defaultdict
from enum import Enum
from logging import getLogger
import re
from urllib.parse import urljoin, urlsplit, urlunsplit

from lxml import etree

from apetest.control import (
    Checkbox, FileInput, HiddenInput, RadioButton, RadioButtonGroup,
    SelectSingle, SelectMultiple, SubmitButton, SubmitButtons,
    TextArea, TextField
    )
from apetest.fetch import decode_and_report, encoding_from_bom, load_page
from apetest.referrer import Form, LinkSet, Redirect
from apetest.request import Request

class Accept(Enum):
    """The types of documents that we tell the server we accept."""

    ANY = 1
    """Accept both HTML and XHTML."""

    HTML = 2
    """Accept only HTML."""

_LOG = getLogger(__name__)

_RE_XML_DECL = re.compile(
    r'<\?xml([ \t\r\n\'"\w.\-=]*).*\?>'
    )
_RE_XML_DECL_ATTR = re.compile(
    r'[ \t\r\n]+([a-z]+)[ \t\r\n]*=[ \t\r\n]*'
    r'(?P<quote>[\'"])([\w.\-]*)(?P=quote)'
    )

def strip_xml_decl(text):
    """Strip the XML declaration from the start of the given text.

    Returns the given text without XML declaration, or the unmodified text if
    no XML declaration was found.
    """
    match = _RE_XML_DECL.match(text)
    return text if match is None else text[match.end():]

def encoding_from_xml_decl(text):
    """Look for an XML declaration with an `encoding` attribute at the start
    of the given text.

    Returns:

    encoding
        The attribute value, converted to lower case.
    None
        If no attribute was found.
    """

    match = _RE_XML_DECL.match(text)
    if match is not None:
        decl = match.group(1)
        for match in _RE_XML_DECL_ATTR.finditer(decl):
            name, quote_, value = match.groups()
            if name == 'encoding':
                return value.lower()
    return None

def normalize_url(url):
    """Returns a unique string for the given URL.

    This is required in some places, since different libraries
    have different opinions on whether local URLs should start with
    `file:/` or `file:///`.
    """
    return urlunsplit(urlsplit(url))

def parse_document(content, is_xml, report):
    """Parse the given XML or HTML document.

    Parameters:

    content
        Text to be parsed.
    is_xml
        If `True`, parse as XML, otherwise parse as HTML.
    report: apetest.report.Report
        Parse errors are logged here.

    Returns:

    tree
        A document `etree`.
    None
        If the document is too broken to be parsed.
    """
    parser_factory = etree.XMLParser if is_xml else etree.HTMLParser
    parser = parser_factory(recover=True)

    if is_xml:
        # The lxml parser does not accept encoding in XML declarations
        # when parsing strings.
        content = strip_xml_decl(content)
    try:
        root = etree.fromstring(content, parser)
    except etree.XMLSyntaxError:
        report.error(
            'Failed to parse document as %s; '
            'cannot gather references to other documents.',
            'XML' if is_xml else 'HTML'
            )
        return None

    # The lxml HTML parser is an HTML4 parser. HTML5 is similar enough
    # that it will still be able to produce a document tree, but it will
    # report errors on, for example, inline SVG.
    if is_xml:
        for error in parser.error_log:
            if hasattr(error, 'line'):
                line = error.line
            elif hasattr(error, 'position'):
                line = error.position[0]
            else:
                line = None

            message = error.message
            if line is not None:
                message += ' (line %d)' % line

            report.error(message)

    return None if root is None else root.getroottree()

def _parse_input_control(attrib):
    _LOG.debug('input: %s', attrib)
    disabled = 'disabled' in attrib
    if disabled:
        return None
    # TODO: Support readonly controls?
    name = attrib.get('name')
    ctype = attrib.get('type')
    value = attrib.get('value')
    if ctype in ('text', 'password'):
        return TextField(name, value)
    elif ctype == 'checkbox':
        return Checkbox(name, value)
    elif ctype == 'radio':
        return RadioButton(name, value)
    elif ctype == 'file':
        return FileInput(name, value)
    elif ctype == 'hidden':
        return HiddenInput(name, value)
    elif ctype in ('submit', 'image'):
        return SubmitButton(name, value)
    elif ctype in ('button', 'reset'):
        # Type "button" is used by JavaScript, "reset" by the browser.
        return None
    else:
        # Invalid control type, will already be flagged by the DTD.
        return None

class PageChecker:
    """Retrieves a page, checks its contents and finds references
    to other pages.
    """

    def __init__(self, base_url, accept, scribe, plugins):
        """Initialize page checker.

        Parameters:

        base_url
            Base URL for the web site or app under test.
        accept: Accept
            The types of documents that we tell the server we accept.
        scribe: apetest.report.Scribe
            Reports will be added here.
        plugins: apetest.plugin.PluginCollection
            Plugins to notify of loaded documents.
        """

        self.base_url = normalize_url(base_url)
        self.accept = accept
        self.scribe = scribe
        self.plugins = plugins

    def short_url(self, url):
        """Return a shortened version of `url`.

        This drops the part of the URL that all pages share.
        """

        assert url.startswith(self.base_url), url
        return url[self.base_url.rindex('/') + 1 : ]

    def check(self, req):
        """Check a single `apetest.request.Request`."""

        req_url = str(req)
        _LOG.info('Checking page: %s', self.short_url(req_url))

        accept = self.accept
        accept_header = {
            # Prefer XHTML to HTML because it is stricter.
            Accept.ANY: 'text/html; q=0.8, application/xhtml+xml; q=1.0',
            Accept.HTML: 'text/html; q=1.0'
            }[accept]

        report, response, content_bytes = load_page(
            req_url, req.maybe_bad, accept_header
            )
        referrers = []

        if response is not None and response.code is not None \
                and 300 <= response.code < 400:
            content_url = normalize_url(response.url)
            if content_url != req_url:
                if content_url.startswith(self.base_url):
                    if not content_url.startswith('file:'):
                        report.info(
                            'Redirected to: %s', self.short_url(content_url)
                            )
                    try:
                        referrers.append(
                            Redirect(Request.from_url(content_url))
                            )
                    except ValueError as ex:
                        report.warning('%s', ex)
                else:
                    report.info('Redirected outside: %s', content_url)

        if content_bytes is None:
            report.info('Could not get any content to check')
            skip_content = True
        elif response.code in (200, None):
            skip_content = False
        else:
            # TODO: This should probably be user-selectable.
            #       A lot of web servers produce error and redirection
            #       notices that are not HTML5 compliant. Checking the
            #       content is likely only useful if the application
            #       under test is producing the content instead.
            report.info(
                'Skipping content check because of HTTP status %d',
                response.code
                )
            skip_content = True

        if skip_content:
            report.checked = True
            self.scribe.add_report(report)
            return referrers

        headers = response.headers
        content_type_header = headers['Content-Type']
        if content_type_header is None:
            message = 'Missing Content-Type header'
            _LOG.error(message)
            report.error(message)
            self.scribe.add_report(report)
            return referrers

        content_type = headers.get_content_type()
        is_html = content_type in ('text/html', 'application/xhtml+xml')
        is_xml = content_type.endswith('/xml') or content_type.endswith('+xml')
        http_encoding = headers.get_content_charset()

        # Speculatively decode the first 1024 bytes, so we can look inside
        # the document for encoding clues.
        bom_encoding = encoding_from_bom(content_bytes)
        content_head = content_bytes[:1024].decode(
            bom_encoding or 'ascii', 'replace'
            )

        if not is_xml and content_head.startswith('<?xml'):
            is_xml = True
            if req_url.startswith('file:'):
                # Silently correct content-type detection for local files.
                # This is not something the user can easily fix, so issuing
                # a warning would not be helpful.
                if content_type == 'text/html':
                    content_type = 'application/xhtml+xml'
            else:
                report.warning(
                    'Document is served with content type "%s" '
                    'but starts with an XML declaration',
                    content_type
                    )

        if not is_xml and not content_type.startswith('text/'):
            self.plugins.resource_loaded(
                content_bytes, content_type_header, report
                )
            message = 'Document of type "%s" is probably not text; ' \
                'skipping.' % content_type
            _LOG.info(message)
            report.info(message)
            report.checked = True # not really, but we just logged why not
            self.scribe.add_report(report)
            return referrers

        if is_html and is_xml and accept is Accept.HTML:
            report.warning(
                'HTML document is serialized as XML, while the HTTP Accept '
                'header did not include "application/xhtml+xml"'
                )

        # Look for encoding in XML declaration.
        decl_encoding = encoding_from_xml_decl(content_head)

        # TODO: Also look at HTML <meta> tags.

        # Try possible encodings in order of precedence.
        # W3C recommends giving the BOM, if present, precedence over HTTP.
        #   http://www.w3.org/International/questions/qa-byte-order-mark
        try:
            content, used_encoding = decode_and_report(
                content_bytes,
                ((bom_encoding, 'Byte Order Mark'),
                 (decl_encoding, 'XML declaration'),
                 (http_encoding, 'HTTP header')),
                report
                )
        except UnicodeDecodeError as ex:
            # All likely encodings failed.
            report.error('Failed to decode contents')
            self.scribe.add_report(report)
            return referrers

        if req_url.startswith('file:'):
            # Construct a new header that is likely more accurate.
            content_type_header = '%s; charset=%s' % (
                content_type, used_encoding
                )
        self.plugins.resource_loaded(content_bytes, content_type_header, report)

        if is_html or is_xml:
            tree = parse_document(content, is_xml, report)
            if tree is not None:
                # Find links to other documents.
                referrers += self.find_referrers_in_xml(tree, req_url, report)
                if is_html:
                    referrers += self.find_referrers_in_html(tree, req_url)

        self.scribe.add_report(report)
        return referrers

    _htmlLinkElements = {
        'a': 'href',
        'link': 'href',
        'img': 'src',
        'script': 'src',
        }
    _xmlLinkElements = {
        '{http://www.w3.org/1999/xhtml}' + tag_name: attr_name
        for tag_name, attr_name in _htmlLinkElements.items()
        }
    # SVG 1.1 uses XLink, but SVG 2 has native 'href' attributes.
    # We're only interested in elements that can link to external
    # resources, not all elements that support 'href'.
    _xmlLinkElements.update({
        '{http://www.w3.org/2000/svg}' + tag_name: 'href'
        for tag_name in ('a', 'image', 'script')
        })
    _xmlLinkElements.update({
        '{http://www.w3.org/2005/Atom}link': 'href'
        })
    # Insert HTML elements without namespace for HTML trees and
    # with namespace for XHTML trees.
    _linkElements = dict(_htmlLinkElements)
    _linkElements.update(_xmlLinkElements)

    def find_urls(self, tree):
        """Yield URLs found in the document `tree`.
        """
        get_attr_name = self._linkElements.__getitem__
        for node in tree.getroot().iter():
            try:
                yield node.attrib[get_attr_name(node.tag)]
            except KeyError:
                pass
            try:
                yield node.attrib['{http://www.w3.org/1999/xlink}href']
            except KeyError:
                pass

    def find_referrers_in_xml(self, tree, tree_url, report):
        """Yield `apetest.referrer.Referrer` objects for links found
        in XML tags in the document `tree`.
        """
        links = defaultdict(LinkSet)
        for url in self.find_urls(tree):
            _LOG.debug(' Found URL: %s', url)
            if url.startswith('?'):
                url = urlsplit(tree_url).path + url
            url = urljoin(tree_url, url)
            if url.startswith(self.base_url):
                try:
                    request = Request.from_url(url)
                except ValueError as ex:
                    report.warning('%s', ex)
                else:
                    links[request.page_url].add(request)
        yield from links.values()

    def find_referrers_in_html(self, tree, url):
        """Yield `apetest.referrer.Referrer` objects for links and forms
        found in HTML tags in the document `tree`.
        """
        root = tree.getroot()
        ns_prefix = '{%s}' % root.nsmap[None] if None in root.nsmap else ''

        for form_node in root.getiterator(ns_prefix + 'form'):
            # TODO: How to handle an empty action?
            #       1. take current path, erase query (current impl)
            #       2. take current path, merge query
            #       3. flag as error (not clearly specced)
            #       I think either flag as error, or mimic the browsers.
            try:
                action = form_node.attrib['action'] or urlsplit(url).path
                method = form_node.attrib['method'].lower()
            except KeyError:
                continue
            if method == 'post':
                # TODO: Support POST (with flag to enable/disable).
                continue
            if method != 'get':
                # The DTD will already have flagged this as a violation.
                continue
            submit_url = urljoin(url, action)
            if not submit_url.startswith(self.base_url):
                continue

            # Note: Disabled controls should not be submitted, so we pretend
            #       they do not even exist.
            controls = []
            radio_buttons = defaultdict(list)
            submit_buttons = []
            for inp in form_node.getiterator(ns_prefix + 'input'):
                control = _parse_input_control(inp.attrib)
                if control is None:
                    pass
                elif isinstance(control, RadioButton):
                    radio_buttons[control.name].append(control)
                elif isinstance(control, SubmitButton):
                    submit_buttons.append(control)
                else:
                    controls.append(control)
            for control in form_node.getiterator(ns_prefix + 'select'):
                name = control.attrib.get('name')
                multiple = control.attrib.get('multiple')
                disabled = 'disabled' in control.attrib
                if disabled:
                    continue
                options = [
                    option.attrib.get('value', option.text)
                    for option in control.getiterator(ns_prefix + 'option')
                    ]
                if multiple:
                    for option in options:
                        controls.append(SelectMultiple(name, option))
                else:
                    controls.append(SelectSingle(name, options))
            for control in form_node.getiterator(ns_prefix + 'textarea'):
                name = control.attrib.get('name')
                value = control.text
                disabled = 'disabled' in control.attrib
                if disabled:
                    continue
                _LOG.debug('textarea "%s": %s', name, value)
                controls.append(TextArea(name, value))

            # Merge exclusive controls.
            for buttons in radio_buttons.values():
                controls.append(RadioButtonGroup(buttons))
            if submit_buttons:
                controls.append(SubmitButtons(submit_buttons))
            # If the form contains no submit buttons, assume it can be
            # submitted using JavaScript, so yield it anyway.

            yield Form(submit_url, method, controls)

Functions

def encoding_from_xml_decl(text)

Look for an XML declaration with an encoding attribute at the start of the given text.

Returns

encoding
The attribute value, converted to lower case.
None
If no attribute was found.
Source code
def encoding_from_xml_decl(text):
    """Look for an XML declaration with an `encoding` attribute at the start
    of the given text.

    Returns:

    encoding
        The attribute value, converted to lower case.
    None
        If no attribute was found.
    """

    match = _RE_XML_DECL.match(text)
    if match is not None:
        decl = match.group(1)
        for match in _RE_XML_DECL_ATTR.finditer(decl):
            name, quote_, value = match.groups()
            if name == 'encoding':
                return value.lower()
    return None
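
A doctest-style sketch of the expected behavior; the input strings are illustrative:

>>> encoding_from_xml_decl('<?xml version="1.0" encoding="UTF-8"?><root/>')
'utf-8'
>>> encoding_from_xml_decl('<root/>') is None
True
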
def normalize_url(url)

Returns a unique string for the given URL.

This is required in some places, since different libraries have different opinions on whether local URLs should start with file:/ or file:///.

Source code
def normalize_url(url):
    """Returns a unique string for the given URL.

    This is required in some places, since different libraries
    have different opinions on whether local URLs should start with
    `file:/` or `file:///`.
    """
    return urlunsplit(urlsplit(url))
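
For example, the two spellings of a local URL mentioned above normalize to the same string (a doctest-style sketch):

>>> normalize_url('file:///tmp/index.html') == normalize_url('file:/tmp/index.html')
True
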
def parse_document(content, is_xml, report)

Parse the given XML or HTML document.

Parameters

content
Text to be parsed.
is_xml
If True, parse as XML, otherwise parse as HTML.
report : Report
Parse errors are logged here.

Returns

tree
A document etree.
None
If the document is too broken to be parsed.
Source code
def parse_document(content, is_xml, report):
    """Parse the given XML or HTML document.

    Parameters:

    content
        Text to be parsed.
    is_xml
        If `True`, parse as XML, otherwise parse as HTML.
    report: apetest.report.Report
        Parse errors are logged here.

    Returns:

    tree
        A document `etree`.
    None
        If the document is too broken to be parsed.
    """
    parser_factory = etree.XMLParser if is_xml else etree.HTMLParser
    parser = parser_factory(recover=True)

    if is_xml:
        # The lxml parser does not accept encoding in XML declarations
        # when parsing strings.
        content = strip_xml_decl(content)
    try:
        root = etree.fromstring(content, parser)
    except etree.XMLSyntaxError:
        report.error(
            'Failed to parse document as %s; '
            'cannot gather references to other documents.',
            'XML' if is_xml else 'HTML'
            )
        return None

    # The lxml HTML parser is an HTML4 parser. HTML5 is similar enough
    # that it will still be able to produce a document tree, but it will
    # report errors on, for example, inline SVG.
    if is_xml:
        for error in parser.error_log:
            if hasattr(error, 'line'):
                line = error.line
            elif hasattr(error, 'position'):
                line = error.position[0]
            else:
                line = None

            message = error.message
            if line is not None:
                message += ' (line %d)' % line

            report.error(message)

    return None if root is None else root.getroottree()
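
A minimal sketch of a call; _StubReport is a hypothetical stand-in for apetest.report.Report that only needs an error() method here:

class _StubReport:
    # Hypothetical stand-in for apetest.report.Report.
    def error(self, message, *args):
        print('ERROR:', message % args if args else message)

# A well-formed XHTML fragment parses into an lxml element tree.
tree = parse_document(
    '<html xmlns="http://www.w3.org/1999/xhtml"><body/></html>',
    True, _StubReport()
    )
print(tree is not None)  # True
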
def strip_xml_decl(text)

Strip the XML declaration from the start of the given text.

Returns the given text without XML declaration, or the unmodified text if no XML declaration was found.

Source code
def strip_xml_decl(text):
    """Strip the XML declaration from the start of the given text.

    Returns the given text without XML declaration, or the unmodified text if
    no XML declaration was found.
    """
    match = _RE_XML_DECL.match(text)
    return text if match is None else text[match.end():]
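
A doctest-style sketch:

>>> strip_xml_decl('<?xml version="1.0"?><root/>')
'<root/>'
>>> strip_xml_decl('no declaration here')
'no declaration here'
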

Classes

class Accept (ancestors: enum.Enum)

The types of documents that we tell the server we accept.

Source code
class Accept(Enum):
    """The types of documents that we tell the server we accept."""

    ANY = 1
    """Accept both HTML and XHTML."""

    HTML = 2
    """Accept only HTML."""}

Class variables

var ANY

Accept both HTML and XHTML.

var HTML

Accept only HTML.

class PageChecker

Retrieves a page, checks its contents and finds references to other pages.

Source code
class PageChecker:
    """Retrieves a page, checks its contents and finds references
    to other pages.
    """

    def __init__(self, base_url, accept, scribe, plugins):
        """Initialize page checker.

        Parameters:

        base_url
            Base URL for the web site or app under test.
        accept: Accept
            The types of documents that we tell the server we accept.
        scribe: apetest.report.Scribe
            Reports will be added here.
        plugins: apetest.plugin.PluginCollection
            Plugins to notify of loaded documents.
        """

        self.base_url = normalize_url(base_url)
        self.accept = accept
        self.scribe = scribe
        self.plugins = plugins

    def short_url(self, url):
        """Return a shortened version of `url`.

        This drops the part of the URL that all pages share.
        """

        assert url.startswith(self.base_url), url
        return url[self.base_url.rindex('/') + 1 : ]

    def check(self, req):
        """Check a single `apetest.request.Request`."""

        req_url = str(req)
        _LOG.info('Checking page: %s', self.short_url(req_url))

        accept = self.accept
        accept_header = {
            # Prefer XHTML to HTML because it is stricter.
            Accept.ANY: 'text/html; q=0.8, application/xhtml+xml; q=1.0',
            Accept.HTML: 'text/html; q=1.0'
            }[accept]

        report, response, content_bytes = load_page(
            req_url, req.maybe_bad, accept_header
            )
        referrers = []

        if response is not None and response.code is not None \
                and 300 <= response.code < 400:
            content_url = normalize_url(response.url)
            if content_url != req_url:
                if content_url.startswith(self.base_url):
                    if not content_url.startswith('file:'):
                        report.info(
                            'Redirected to: %s', self.short_url(content_url)
                            )
                    try:
                        referrers.append(
                            Redirect(Request.from_url(content_url))
                            )
                    except ValueError as ex:
                        report.warning('%s', ex)
                else:
                    report.info('Redirected outside: %s', content_url)

        if content_bytes is None:
            report.info('Could not get any content to check')
            skip_content = True
        elif response.code in (200, None):
            skip_content = False
        else:
            # TODO: This should probably be user-selectable.
            #       A lot of web servers produce error and redirection
            #       notices that are not HTML5 compliant. Checking the
            #       content is likely only useful if the application
            #       under test is producing the content instead.
            report.info(
                'Skipping content check because of HTTP status %d',
                response.code
                )
            skip_content = True

        if skip_content:
            report.checked = True
            self.scribe.add_report(report)
            return referrers

        headers = response.headers
        content_type_header = headers['Content-Type']
        if content_type_header is None:
            message = 'Missing Content-Type header'
            _LOG.error(message)
            report.error(message)
            self.scribe.add_report(report)
            return referrers

        content_type = headers.get_content_type()
        is_html = content_type in ('text/html', 'application/xhtml+xml')
        is_xml = content_type.endswith('/xml') or content_type.endswith('+xml')
        http_encoding = headers.get_content_charset()

        # Speculatively decode the first 1024 bytes, so we can look inside
        # the document for encoding clues.
        bom_encoding = encoding_from_bom(content_bytes)
        content_head = content_bytes[:1024].decode(
            bom_encoding or 'ascii', 'replace'
            )

        if not is_xml and content_head.startswith('<?xml'):
            is_xml = True
            if req_url.startswith('file:'):
                # Silently correct content-type detection for local files.
                # This is not something the user can easily fix, so issuing
                # a warning would not be helpful.
                if content_type == 'text/html':
                    content_type = 'application/xhtml+xml'
            else:
                report.warning(
                    'Document is served with content type "%s" '
                    'but starts with an XML declaration',
                    content_type
                    )

        if not is_xml and not content_type.startswith('text/'):
            self.plugins.resource_loaded(
                content_bytes, content_type_header, report
                )
            message = 'Document of type "%s" is probably not text; ' \
                'skipping.' % content_type
            _LOG.info(message)
            report.info(message)
            report.checked = True # not really, but we just logged why not
            self.scribe.add_report(report)
            return referrers

        if is_html and is_xml and accept is Accept.HTML:
            report.warning(
                'HTML document is serialized as XML, while the HTTP Accept '
                'header did not include "application/xhtml+xml"'
                )

        # Look for encoding in XML declaration.
        decl_encoding = encoding_from_xml_decl(content_head)

        # TODO: Also look at HTML <meta> tags.

        # Try possible encodings in order of precedence.
        # W3C recommends giving the BOM, if present, precedence over HTTP.
        #   http://www.w3.org/International/questions/qa-byte-order-mark
        try:
            content, used_encoding = decode_and_report(
                content_bytes,
                ((bom_encoding, 'Byte Order Mark'),
                 (decl_encoding, 'XML declaration'),
                 (http_encoding, 'HTTP header')),
                report
                )
        except UnicodeDecodeError as ex:
            # All likely encodings failed.
            report.error('Failed to decode contents')
            self.scribe.add_report(report)
            return referrers

        if req_url.startswith('file:'):
            # Construct a new header that is likely more accurate.
            content_type_header = '%s; charset=%s' % (
                content_type, used_encoding
                )
        self.plugins.resource_loaded(content_bytes, content_type_header, report)

        if is_html or is_xml:
            tree = parse_document(content, is_xml, report)
            if tree is not None:
                # Find links to other documents.
                referrers += self.find_referrers_in_xml(tree, req_url, report)
                if is_html:
                    referrers += self.find_referrers_in_html(tree, req_url)

        self.scribe.add_report(report)
        return referrers

    _htmlLinkElements = {
        'a': 'href',
        'link': 'href',
        'img': 'src',
        'script': 'src',
        }
    _xmlLinkElements = {
        '{http://www.w3.org/1999/xhtml}' + tag_name: attr_name
        for tag_name, attr_name in _htmlLinkElements.items()
        }
    # SVG 1.1 uses XLink, but SVG 2 has native 'href' attributes.
    # We're only interested in elements that can link to external
    # resources, not all elements that support 'href'.
    _xmlLinkElements.update({
        '{http://www.w3.org/2000/svg}' + tag_name: 'href'
        for tag_name in ('a', 'image', 'script')
        })
    _xmlLinkElements.update({
        '{http://www.w3.org/2005/Atom}link': 'href'
        })
    # Insert HTML elements without namespace for HTML trees and
    # with namespace for XHTML trees.
    _linkElements = dict(_htmlLinkElements)
    _linkElements.update(_xmlLinkElements)

    def find_urls(self, tree):
        """Yield URLs found in the document `tree`.
        """
        get_attr_name = self._linkElements.__getitem__
        for node in tree.getroot().iter():
            try:
                yield node.attrib[get_attr_name(node.tag)]
            except KeyError:
                pass
            try:
                yield node.attrib['{http://www.w3.org/1999/xlink}href']
            except KeyError:
                pass

    def find_referrers_in_xml(self, tree, tree_url, report):
        """Yield `apetest.referrer.Referrer` objects for links found
        in XML tags in the document `tree`.
        """
        links = defaultdict(LinkSet)
        for url in self.find_urls(tree):
            _LOG.debug(' Found URL: %s', url)
            if url.startswith('?'):
                url = urlsplit(tree_url).path + url
            url = urljoin(tree_url, url)
            if url.startswith(self.base_url):
                try:
                    request = Request.from_url(url)
                except ValueError as ex:
                    report.warning('%s', ex)
                else:
                    links[request.page_url].add(request)
        yield from links.values()

    def find_referrers_in_html(self, tree, url):
        """Yield `apetest.referrer.Referrer` objects for links and forms
        found in HTML tags in the document `tree`.
        """
        root = tree.getroot()
        ns_prefix = '{%s}' % root.nsmap[None] if None in root.nsmap else ''

        for form_node in root.getiterator(ns_prefix + 'form'):
            # TODO: How to handle an empty action?
            #       1. take current path, erase query (current impl)
            #       2. take current path, merge query
            #       3. flag as error (not clearly specced)
            #       I think either flag as error, or mimic the browsers.
            try:
                action = form_node.attrib['action'] or urlsplit(url).path
                method = form_node.attrib['method'].lower()
            except KeyError:
                continue
            if method == 'post':
                # TODO: Support POST (with flag to enable/disable).
                continue
            if method != 'get':
                # The DTD will already have flagged this as a violation.
                continue
            submit_url = urljoin(url, action)
            if not submit_url.startswith(self.base_url):
                continue

            # Note: Disabled controls should not be submitted, so we pretend
            #       they do not even exist.
            controls = []
            radio_buttons = defaultdict(list)
            submit_buttons = []
            for inp in form_node.getiterator(ns_prefix + 'input'):
                control = _parse_input_control(inp.attrib)
                if control is None:
                    pass
                elif isinstance(control, RadioButton):
                    radio_buttons[control.name].append(control)
                elif isinstance(control, SubmitButton):
                    submit_buttons.append(control)
                else:
                    controls.append(control)
            for control in form_node.getiterator(ns_prefix + 'select'):
                name = control.attrib.get('name')
                multiple = control.attrib.get('multiple')
                disabled = 'disabled' in control.attrib
                if disabled:
                    continue
                options = [
                    option.attrib.get('value', option.text)
                    for option in control.getiterator(ns_prefix + 'option')
                    ]
                if multiple:
                    for option in options:
                        controls.append(SelectMultiple(name, option))
                else:
                    controls.append(SelectSingle(name, options))
            for control in form_node.getiterator(ns_prefix + 'textarea'):
                name = control.attrib.get('name')
                value = control.text
                disabled = 'disabled' in control.attrib
                if disabled:
                    continue
                _LOG.debug('textarea "%s": %s', name, value)
                controls.append(TextArea(name, value))

            # Merge exclusive controls.
            for buttons in radio_buttons.values():
                controls.append(RadioButtonGroup(buttons))
            if submit_buttons:
                controls.append(SubmitButtons(submit_buttons))
            # If the form contains no submit buttons, assume it can be
            # submitted using JavaScript, so yield it anyway.

            yield Form(submit_url, method, controls)
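
A minimal usage sketch; scribe and plugins are assumed to be apetest.report.Scribe and apetest.plugin.PluginCollection instances created elsewhere:

checker = PageChecker('http://localhost:8080/', Accept.ANY, scribe, plugins)
first_request = Request.from_url('http://localhost:8080/')
# check() returns apetest.referrer.Referrer objects describing the links
# and forms it found; a crawler would queue new requests based on these.
referrers = checker.check(first_request)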

Methods

def __init__(self, base_url, accept, scribe, plugins)

Initialize page checker.

Parameters

base_url
Base URL for the web site or app under test.
accept : Accept
The types of documents that we tell the server we accept.
scribe : Scribe
Reports will be added here.
plugins : PluginCollection
Plugins to notify of loaded documents.
Source code
def __init__(self, base_url, accept, scribe, plugins):
    """Initialize page checker.

    Parameters:

    base_url
        Base URL for the web site or app under test.
    accept: Accept
        The types of documents that we tell the server we accept.
    scribe: apetest.report.Scribe
        Reports will be added here.
    plugins: apetest.plugin.PluginCollection
        Plugins to notify of loaded documents.
    """

    self.base_url = normalize_url(base_url)
    self.accept = accept
    self.scribe = scribe
    self.plugins = plugins
def check(self, req)

Check a single Request.

Source code
def check(self, req):
    """Check a single `apetest.request.Request`."""

    req_url = str(req)
    _LOG.info('Checking page: %s', self.short_url(req_url))

    accept = self.accept
    accept_header = {
        # Prefer XHTML to HTML because it is stricter.
        Accept.ANY: 'text/html; q=0.8, application/xhtml+xml; q=1.0',
        Accept.HTML: 'text/html; q=1.0'
        }[accept]

    report, response, content_bytes = load_page(
        req_url, req.maybe_bad, accept_header
        )
    referrers = []

    if response is not None and response.code is not None \
            and 300 <= response.code < 400:
        content_url = normalize_url(response.url)
        if content_url != req_url:
            if content_url.startswith(self.base_url):
                if not content_url.startswith('file:'):
                    report.info(
                        'Redirected to: %s', self.short_url(content_url)
                        )
                try:
                    referrers.append(
                        Redirect(Request.from_url(content_url))
                        )
                except ValueError as ex:
                    report.warning('%s', ex)
            else:
                report.info('Redirected outside: %s', content_url)

    if content_bytes is None:
        report.info('Could not get any content to check')
        skip_content = True
    elif response.code in (200, None):
        skip_content = False
    else:
        # TODO: This should probably be user-selectable.
        #       A lot of web servers produce error and redirection
        #       notices that are not HTML5 compliant. Checking the
        #       content is likely only useful if the application
        #       under test is producing the content instead.
        report.info(
            'Skipping content check because of HTTP status %d',
            response.code
            )
        skip_content = True

    if skip_content:
        report.checked = True
        self.scribe.add_report(report)
        return referrers

    headers = response.headers
    content_type_header = headers['Content-Type']
    if content_type_header is None:
        message = 'Missing Content-Type header'
        _LOG.error(message)
        report.error(message)
        self.scribe.add_report(report)
        return referrers

    content_type = headers.get_content_type()
    is_html = content_type in ('text/html', 'application/xhtml+xml')
    is_xml = content_type.endswith('/xml') or content_type.endswith('+xml')
    http_encoding = headers.get_content_charset()

    # Speculatively decode the first 1024 bytes, so we can look inside
    # the document for encoding clues.
    bom_encoding = encoding_from_bom(content_bytes)
    content_head = content_bytes[:1024].decode(
        bom_encoding or 'ascii', 'replace'
        )

    if not is_xml and content_head.startswith('<?xml'):
        is_xml = True
        if req_url.startswith('file:'):
            # Silently correct content-type detection for local files.
            # This is not something the user can easily fix, so issuing
            # a warning would not be helpful.
            if content_type == 'text/html':
                content_type = 'application/xhtml+xml'
        else:
            report.warning(
                'Document is served with content type "%s" '
                'but starts with an XML declaration',
                content_type
                )

    if not is_xml and not content_type.startswith('text/'):
        self.plugins.resource_loaded(
            content_bytes, content_type_header, report
            )
        message = 'Document of type "%s" is probably not text; ' \
            'skipping.' % content_type
        _LOG.info(message)
        report.info(message)
        report.checked = True # not really, but we just logged why not
        self.scribe.add_report(report)
        return referrers

    if is_html and is_xml and accept is Accept.HTML:
        report.warning(
            'HTML document is serialized as XML, while the HTTP Accept '
            'header did not include "application/xhtml+xml"'
            )

    # Look for encoding in XML declaration.
    decl_encoding = encoding_from_xml_decl(content_head)

    # TODO: Also look at HTML <meta> tags.

    # Try possible encodings in order of precedence.
    # W3C recommends giving the BOM, if present, precedence over HTTP.
    #   http://www.w3.org/International/questions/qa-byte-order-mark
    try:
        content, used_encoding = decode_and_report(
            content_bytes,
            ((bom_encoding, 'Byte Order Mark'),
             (decl_encoding, 'XML declaration'),
             (http_encoding, 'HTTP header')),
            report
            )
    except UnicodeDecodeError as ex:
        # All likely encodings failed.
        report.error('Failed to decode contents')
        self.scribe.add_report(report)
        return referrers

    if req_url.startswith('file:'):
        # Construct a new header that is likely more accurate.
        content_type_header = '%s; charset=%s' % (
            content_type, used_encoding
            )
    self.plugins.resource_loaded(content_bytes, content_type_header, report)

    if is_html or is_xml:
        tree = parse_document(content, is_xml, report)
        if tree is not None:
            # Find links to other documents.
            referrers += self.find_referrers_in_xml(tree, req_url, report)
            if is_html:
                referrers += self.find_referrers_in_html(tree, req_url)

    self.scribe.add_report(report)
    return referrers
def find_referrers_in_html(self, tree, url)

Yield Referrer objects for links and forms found in HTML tags in the document tree.

Source code
def find_referrers_in_html(self, tree, url):
    """Yield `apetest.referrer.Referrer` objects for links and forms
    found in HTML tags in the document `tree`.
    """
    root = tree.getroot()
    ns_prefix = '{%s}' % root.nsmap[None] if None in root.nsmap else ''

    for form_node in root.getiterator(ns_prefix + 'form'):
        # TODO: How to handle an empty action?
        #       1. take current path, erase query (current impl)
        #       2. take current path, merge query
        #       3. flag as error (not clearly specced)
        #       I think either flag as error, or mimic the browsers.
        try:
            action = form_node.attrib['action'] or urlsplit(url).path
            method = form_node.attrib['method'].lower()
        except KeyError:
            continue
        if method == 'post':
            # TODO: Support POST (with flag to enable/disable).
            continue
        if method != 'get':
            # The DTD will already have flagged this as a violation.
            continue
        submit_url = urljoin(url, action)
        if not submit_url.startswith(self.base_url):
            continue

        # Note: Disabled controls should not be submitted, so we pretend
        #       they do not even exist.
        controls = []
        radio_buttons = defaultdict(list)
        submit_buttons = []
        for inp in form_node.getiterator(ns_prefix + 'input'):
            control = _parse_input_control(inp.attrib)
            if control is None:
                pass
            elif isinstance(control, RadioButton):
                radio_buttons[control.name].append(control)
            elif isinstance(control, SubmitButton):
                submit_buttons.append(control)
            else:
                controls.append(control)
        for control in form_node.getiterator(ns_prefix + 'select'):
            name = control.attrib.get('name')
            multiple = control.attrib.get('multiple')
            disabled = 'disabled' in control.attrib
            if disabled:
                continue
            options = [
                option.attrib.get('value', option.text)
                for option in control.getiterator(ns_prefix + 'option')
                ]
            if multiple:
                for option in options:
                    controls.append(SelectMultiple(name, option))
            else:
                controls.append(SelectSingle(name, options))
        for control in form_node.getiterator(ns_prefix + 'textarea'):
            name = control.attrib.get('name')
            value = control.text
            disabled = 'disabled' in control.attrib
            if disabled:
                continue
            _LOG.debug('textarea "%s": %s', name, value)
            controls.append(TextArea(name, value))

        # Merge exclusive controls.
        for buttons in radio_buttons.values():
            controls.append(RadioButtonGroup(buttons))
        if submit_buttons:
            controls.append(SubmitButtons(submit_buttons))
        # If the form contains no submit buttons, assume it can be
        # submitted using JavaScript, so yield it anyway.

        yield Form(submit_url, method, controls)
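
A sketch of extracting a GET form from an in-memory tree; checker is assumed to be a PageChecker whose base_url is 'http://example.test/app/':

from lxml import etree

html = ('<form action="search" method="get">'
        '<input type="text" name="q"/>'
        '<input type="submit" value="Go"/></form>')
tree = etree.fromstring(html, etree.HTMLParser()).getroottree()
# Yields a single apetest.referrer.Form whose controls are the text
# field and the submit button.
forms = list(checker.find_referrers_in_html(
    tree, 'http://example.test/app/index.html'
    ))
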
def find_referrers_in_xml(self, tree, tree_url, report)

Yield Referrer objects for links found in XML tags in the document tree.

Source code
def find_referrers_in_xml(self, tree, tree_url, report):
    """Yield `apetest.referrer.Referrer` objects for links found
    in XML tags in the document `tree`.
    """
    links = defaultdict(LinkSet)
    for url in self.find_urls(tree):
        _LOG.debug(' Found URL: %s', url)
        if url.startswith('?'):
            url = urlsplit(tree_url).path + url
        url = urljoin(tree_url, url)
        if url.startswith(self.base_url):
            try:
                request = Request.from_url(url)
            except ValueError as ex:
                report.warning('%s', ex)
            else:
                links[request.page_url].add(request)
    yield from links.values()
def find_urls(self, tree)

Yield URLs found in the document tree.

Source code
def find_urls(self, tree):
    """Yield URLs found in the document `tree`.
    """
    get_attr_name = self._linkElements.__getitem__
    for node in tree.getroot().iter():
        try:
            yield node.attrib[get_attr_name(node.tag)]
        except KeyError:
            pass
        try:
            yield node.attrib['{http://www.w3.org/1999/xlink}href']
        except KeyError:
            pass
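
A sketch with an in-memory HTML tree; checker is assumed to be any PageChecker instance, since find_urls only consults the class-level link-element tables:

from lxml import etree

html = '<body><a href="a.html">A</a><img src="logo.png"/></body>'
tree = etree.fromstring(html, etree.HTMLParser()).getroottree()
print(list(checker.find_urls(tree)))  # ['a.html', 'logo.png']
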
def short_url(self, url)

Return a shortened version of url.

This drops the part of the URL that all pages share.

Source code
def short_url(self, url):
    """Return a shortened version of `url`.

    This drops the part of the URL that all pages share.
    """

    assert url.startswith(self.base_url), url
    return url[self.base_url.rindex('/') + 1 : ]
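
A doctest-style sketch; short_url does not use scribe or plugins, so None stands in for them here:

>>> checker = PageChecker('http://example.test/app/', Accept.ANY, None, None)
>>> checker.short_url('http://example.test/app/page?q=1')
'page?q=1'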