from __future__ import absolute_import

import re


# Finds "HTTP Error <code>" in any letter case, e.g. "HTTP Error 404: Not Found".
# The trailing negative lookahead rejects longer digit runs, so a message such
# as "HTTP Error 4041" is not misread as status 404.
_HTTP_RE = re.compile(r'HTTP Error\s+(\d{3})(?!\d)', re.IGNORECASE)


def extract_http_status(text):
    """Return the first HTTP status code mentioned in *text*, or None.

    *text* may be any object; it is converted with ``str()`` and falsy
    values (``None``, ``''``, ``0``) are treated as an empty string.  Only
    the ``HTTP Error <three digits>`` form is recognised (case-insensitive).

    Returns:
        The status as an ``int``, or ``None`` when no status is found or the
        value cannot be converted to text.
    """
    try:
        s = str(text or '')
    except Exception:
        # Best effort: a value whose string conversion fails is treated as
        # carrying no status rather than breaking error reporting.
        s = ''
    m = _HTTP_RE.search(s)
    if m is None:
        return None
    # The capture group is exactly three digits, so int() cannot fail here.
    return int(m.group(1))


# HTTP statuses with a dedicated meaning.  The first tag of each entry doubles
# as the failure kind; any further tags are simply added alongside it.
_HTTP_STATUS_TAGS = {
    401: ('auth',),
    403: ('forbidden',),
    404: ('not-found',),
    429: ('rate-limited',),
    504: ('timeout', 'server-error'),
}

# Ordered text rules applied to the lower-cased error + traceback text.
# Each rule is (tag, alternatives); an alternative is a tuple of substrings
# that must ALL be present, and a rule fires when ANY alternative matches.
# Order matters: the first rule that fires decides the kind when the HTTP
# status did not already decide it.
_ERROR_TEXT_RULES = (
    # DNS / connectivity
    ('dns-fail', (
        ('getaddrinfo failed',),
        ('name or service not known',),
        ('nodename nor servname provided',),
        ('errno 11001',),
        ('errno 11002',),
    )),
    # Timeout / connect failures
    ('timeout', (
        ('timed out',),
        ('timeout',),
        ('winerror 10060',),
        ('a connection attempt failed',),
    )),
    # SSL / TLS
    ('ssl-error', (
        ('certificate_verify_failed',),
        ('ssl:', 'certificate'),
        ('self-signed certificate',),
        ('hostname mismatch',),
        ('certificate has expired',),
    )),
    # Redirect loop
    ('redirect-loop', (
        ('redirect error that would lead to an infinite loop',),
    )),
    # Invalid URL
    ('invalid-url', (
        ("url can't contain control characters",),
        ('invalid url',),
        ('unknown url type',),
    )),
    # Parse errors
    ('parse-error', (
        ('no element found',),
        ('not well-formed',),
        ('syntax error',),
        ('mismatched tag',),
        ('undefined entity',),
    )),
    # A parse error at line 1/col 0 often means an HTML/block page.  Both
    # needles also satisfy the 'parse-error' rule above, so this rule only
    # ever adds the tag; the kind stays 'parse-error'.
    ('not-xml', (
        ('no element found: line 1, column 0',),
        ('syntax error: line 1, column 0',),
    )),
)


def _error_text(value):
    """Return ``str(value)``, or '' for falsy or unconvertible values."""
    try:
        return str(value or '')
    except Exception:
        # Best effort: classification must never fail because of its input.
        return ''


def _tags_for_http_status(status):
    """Return the tags implied by an HTTP status; the first one is the kind."""
    if status in _HTTP_STATUS_TAGS:
        return _HTTP_STATUS_TAGS[status]
    if 500 <= status < 600:
        # Every 5xx status is a server-side failure, not only the common ones.
        return ('server-error',)
    if 300 <= status < 400:
        return ('redirect',)
    return ()


def _rule_matches(alternatives, hay):
    """Return True when any alternative has all of its substrings in *hay*."""
    return any(all(needle in hay for needle in alt) for alt in alternatives)


def classify_error(error_text, traceback_text=None):
    """Return (tags, http_status, kind) for a failure.

    Tags are short strings intended for filtering/sanitization.

    Args:
        error_text: The error message (any object; converted with ``str()``).
        traceback_text: Optional traceback text, searched in addition to the
            error message.

    Returns:
        A 3-tuple ``(tags, http_status, kind)``:

        * ``tags`` -- sorted list of tag strings; always contains 'failed'.
        * ``http_status`` -- status found by ``extract_http_status`` in the
          error text (or, failing that, the traceback), else ``None``.
        * ``kind`` -- the single most significant tag, or '' when nothing
          matched.  An HTTP-status classification takes precedence over the
          text rules, which are tried in a fixed order.
    """
    err = _error_text(error_text)
    tb = _error_text(traceback_text)

    # NOTE(review): the traceback is scanned too, so any traceback line that
    # merely mentions e.g. "timeout" will trigger the matching rule.
    hay = (err + '\n' + tb).lower()
    tags = set(['failed'])
    kind = ''

    http_status = extract_http_status(err) or extract_http_status(tb)
    if http_status is not None:
        tags.add('http-%d' % http_status)
        status_tags = _tags_for_http_status(http_status)
        if status_tags:
            tags.update(status_tags)
            kind = status_tags[0]

    for tag, alternatives in _ERROR_TEXT_RULES:
        if _rule_matches(alternatives, hay):
            tags.add(tag)
            # Keep the earliest classification as the kind.
            kind = kind or tag

    return sorted(tags), http_status, kind
