from __future__ import absolute_import

import datetime
import email.utils
import gzip
import re
import ssl
import time
import threading
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET


# --- HTTP behavior ---
# We want to be compatible with real-world feeds without being overly aggressive.
# Strategy:
# 1) Use a stable, common desktop browser UA (prefer calibre's UA list) to avoid
#    immediate blocks for obvious script UAs.
# 2) Use a per-thread opener with a cookie jar (helps with redirects/consent flows).
# 3) Retry with a small backoff for transient failures.
# 4) If we see bot-like blocks (403/429/503/etc) repeatedly, fall back to
#    calibre's mechanize browser for that host.


# Per-thread storage: holds the cached urllib opener ('opener') and the
# User-Agent string it was created for ('ua'); see _get_thread_opener().
_TLS = threading.local()
# Session-wide User-Agent, chosen lazily on first use by _stable_user_agent().
_UA = None
# Per-host counter of bot/throttle-style HTTP failures. fetch_url() increments
# it on such a failure and resets it to 0 after any successful fetch.
_FAILS_BY_HOST = {}  # netloc -> int


def _stable_user_agent():
    """Return the User-Agent string used for every request in this session.

    The value is chosen on the first call and cached in the module-level
    ``_UA``. calibre's ``random_user_agent`` pool is preferred; if it cannot
    be imported or called, a fixed plugin UA is used instead. Keeping one
    value for the whole session avoids per-request randomisation, which can
    look suspicious to servers.
    """
    global _UA
    if _UA is None:
        try:
            from calibre import random_user_agent
            _UA = random_user_agent(allow_ie=False)
        except Exception:
            _UA = 'calibre-rss-reader/0.1'
    return _UA


def _get_thread_opener(user_agent=None):
    """Return this thread's urllib opener, creating it when needed.

    The opener is cached in ``_TLS`` together with the User-Agent it was
    requested for. A cached opener is reused only when that User-Agent matches
    the requested one; otherwise a new opener (with a fresh cookie jar) is
    built and replaces the cached one.

    ``user_agent`` defaults to the session UA from ``_stable_user_agent()``.
    """
    wanted_ua = (user_agent or _stable_user_agent()).strip() or 'calibre-rss-reader/0.1'

    cached = getattr(_TLS, 'opener', None)
    if cached is not None and getattr(_TLS, 'ua', None) == wanted_ua:
        return cached

    # Build a new opener with its own cookie jar; fall back to a plain opener
    # if the cookie machinery is unavailable for any reason.
    try:
        import http.cookiejar
        jar = http.cookiejar.CookieJar()
        cookie_handler = urllib.request.HTTPCookieProcessor(jar)
        fresh = urllib.request.build_opener(cookie_handler)
    except Exception:
        fresh = urllib.request.build_opener()

    _TLS.opener = fresh
    _TLS.ua = wanted_ua
    return fresh


def _should_fallback_to_mechanize(http_status=None, exc=None):
    """Tell whether an HTTP status looks like a bot block or throttle.

    Only the status code is inspected; ``exc`` is accepted but not used.
    Network-level failures (SSL errors, resets, ...) are deliberately not
    covered here: those are simply retried by the caller.

    Returns True for the bot/throttle statuses listed below (some proxies and
    CDNs use the 520+ range), and False for everything else, including a
    missing or non-numeric status.
    """
    blocked_statuses = (403, 407, 409, 418, 429, 451, 503, 520, 521, 522, 523, 524)
    if http_status is None:
        return False
    try:
        status = int(http_status)
    except Exception:
        return False
    return status in blocked_statuses


def _mechanize_fetch(url, timeout_seconds=12, headers=None, user_agent=None):
    """Fetch ``url`` with calibre's browser and return ``(raw, final_url)``.

    This path is heavier than urllib, so callers reserve it for hosts that
    block the plain opener. Only the ``Accept`` header is taken from
    ``headers`` (default ``*/*``); the User-Agent is handed to the browser
    directly.

    Raises whatever the calibre import or the browser's ``open`` raises.
    """
    from calibre import browser

    agent = (user_agent or _stable_user_agent()).strip() or 'calibre-rss-reader/0.1'
    br = browser(user_agent=agent, verify_ssl_certificates=True)

    # Be conservative with headers: the browser already carries the UA.
    try:
        accept = (headers or {}).get('Accept') or '*/*'
        br.addheaders += [('Accept', accept)]
    except Exception:
        pass

    # Helps with some servers; fine to ignore when unsupported.
    try:
        br.set_handle_gzip(True)
    except Exception:
        pass

    with br.open(url, timeout=timeout_seconds) as resp:
        raw = resp.read()
        try:
            final_url = resp.geturl()
        except Exception:
            # Fall back to the requested URL when the response cannot say
            # where it finally came from.
            final_url = url
        return raw, final_url


def _strip(text):
    """Return ``text`` without surrounding whitespace; falsy input gives ''."""
    return text.strip() if text else ''


def _first_text(elem, candidates):
    """Return the stripped text of the first candidate child that has text.

    ``candidates`` are paths passed to ``elem.find`` in order. Returns '' when
    ``elem`` is None or no candidate yields an element with non-empty text.
    """
    if elem is None:
        return ''
    # Lazy lookup: later candidates are only searched if earlier ones miss.
    matches = (elem.find(path) for path in candidates)
    for node in matches:
        if node is not None and node.text:
            return _strip(node.text)
    return ''


def _iter_children_by_localname(elem, localname):
    """Yield direct children of ``elem`` whose tag has the given local name.

    A child matches when its tag is exactly ``localname`` or is a namespaced
    tag ending in ``}localname``. Yields nothing when ``elem`` is None.
    """
    if elem is None:
        return
    ns_suffix = '}' + localname
    # Iterate over a snapshot of the children, in document order.
    for node in list(elem):
        node_tag = node.tag
        if node_tag == localname or (isinstance(node_tag, str) and node_tag.endswith(ns_suffix)):
            yield node


def _text_by_localname(elem, localname):
    """Return the stripped text of the first matching child that has text.

    Children are matched by local name, ignoring any namespace. Returns ''
    when no matching child carries text.
    """
    matches = _iter_children_by_localname(elem, localname)
    return next((_strip(node.text) for node in matches if node.text), '')


def _attr_by_localname(elem, localname, attr):
    """Return attribute ``attr`` of the first matching child that has it set.

    Children are matched by local name, ignoring any namespace. The value is
    stripped; '' is returned when no matching child has a non-empty value.
    """
    values = (node.attrib.get(attr) for node in _iter_children_by_localname(elem, localname))
    return next((_strip(value) for value in values if value), '')


def _parse_date_to_iso(dt_text):
    """Convert a feed date string to an ISO 8601 timestamp in UTC.

    Accepted inputs, tried in this order:

    1. RFC 822-style dates as used by RSS ``pubDate``.
    2. Space-separated variants such as ``2025-12-04 09:24:59 -0300``.
    3. ISO 8601 / RFC 3339 date-times as used by Atom, with or without
       fractional seconds.
    4. Shorter W3CDTF forms as used by ``dc:date`` in RDF feeds: a date-time
       without seconds, or a bare ``YYYY-MM-DD`` date.

    Values without timezone information are treated as UTC.

    Returns the ``isoformat()`` string of the UTC datetime, or '' when the
    input is empty or matches none of the formats.
    """
    dt_text = (dt_text or '').strip()
    if not dt_text:
        return ''

    utc = datetime.timezone.utc

    # RSS: RFC822-ish
    try:
        tup = email.utils.parsedate_tz(dt_text)
        if tup:
            ts = email.utils.mktime_tz(tup)
            return datetime.datetime.fromtimestamp(ts, tz=utc).isoformat()
    except Exception:
        # Best-effort: fall through to the strptime-based formats.
        pass

    formats = (
        # Space separator instead of 'T' (seen in some real-world feeds).
        '%Y-%m-%d %H:%M:%S %z',
        '%Y-%m-%d %H:%M:%S',
        # Atom: ISO 8601 / RFC 3339.
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S%z',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f%z',
        # W3CDTF short forms (dc:date): no seconds, or date only.
        '%Y-%m-%dT%H:%MZ',
        '%Y-%m-%dT%H:%M%z',
        '%Y-%m-%d',
    )
    for fmt in formats:
        try:
            dt = datetime.datetime.strptime(dt_text, fmt)
            if dt.tzinfo is None:
                # Treat naive datetimes as UTC
                dt = dt.replace(tzinfo=utc)
            return dt.astimezone(utc).isoformat()
        except (ValueError, OverflowError, TypeError):
            # Wrong format for this string (or out-of-range after the
            # timezone shift); try the next one.
            continue

    return ''


def fetch_url(url, timeout_seconds=12, headers=None):
    """Fetch ``url`` and return ``(raw_bytes, final_url)``.

    Strategy:

    * Requests go through the per-thread urllib opener with feed-friendly
      default headers; entries in ``headers`` (a dict) override the defaults.
    * A gzip-encoded response body is decompressed; if decompression fails
      the body is returned as received.
    * Up to three attempts are made, sleeping with exponential backoff
      between them. Every exception is retried, including HTTP errors.
    * When an HTTP error looks like a bot block or throttle (see
      ``_should_fallback_to_mechanize``), the host's failure counter is
      incremented and calibre's mechanize browser is tried immediately.
    * Hosts whose counter has reached 2 are fetched with mechanize directly.
    * Any successful fetch resets the host's counter to 0.

    ``final_url`` is the URL reported by the response, or ``url`` when the
    response cannot provide one.

    Raises the exception of the last urllib/mechanize attempt. For an HTTP
    error this is the original ``HTTPError``, even if the mechanize fallback
    failed with a different error afterwards.
    """
    # Default headers for feed fetching
    base_headers = {
        'User-Agent': _stable_user_agent(),
        'Accept': 'application/rss+xml, application/atom+xml, application/xml, text/xml, */*;q=0.8',
        'Accept-Encoding': 'gzip',
    }
    if headers and isinstance(headers, dict):
        base_headers.update(headers)
    user_agent = base_headers.get('User-Agent')

    # Per-host failure tracking to decide when to prefer calibre's mechanize browser
    try:
        netloc = urllib.parse.urlparse(url or '').netloc.lower()
    except Exception:
        netloc = ''
    try:
        fail_count = int(_FAILS_BY_HOST.get(netloc, 0) or 0)
    except Exception:
        fail_count = 0

    # Retry/backoff for transient failures
    max_attempts = 3
    backoff_s = 0.35
    last_exc = None

    # If this host has repeatedly returned bot-like responses, go straight to mechanize.
    prefer_mechanize = bool(netloc and fail_count >= 2)

    def fetch_via_mechanize():
        return _mechanize_fetch(
            url,
            timeout_seconds=timeout_seconds,
            headers=base_headers,
            user_agent=user_agent,
        )

    def mark_success():
        if netloc:
            _FAILS_BY_HOST[netloc] = 0

    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1
        try:
            if prefer_mechanize:
                raw, final = fetch_via_mechanize()
            else:
                req = urllib.request.Request(url, headers=base_headers)
                opener = _get_thread_opener(user_agent=user_agent)
                with opener.open(req, timeout=timeout_seconds) as resp:
                    raw = resp.read()
                    try:
                        encoding = (resp.headers.get('Content-Encoding') or '').lower()
                    except Exception:
                        encoding = ''
                    if 'gzip' in encoding:
                        try:
                            raw = gzip.decompress(raw)
                        except Exception:
                            # Mislabelled or corrupt gzip: keep the body as received.
                            pass
                    try:
                        final = resp.geturl()
                    except Exception:
                        final = url
            mark_success()
            return raw, final

        except urllib.error.HTTPError as e:
            last_exc = e
            if _should_fallback_to_mechanize(http_status=getattr(e, 'code', None), exc=e):
                # Mark host as problematic and try mechanize immediately.
                if netloc:
                    _FAILS_BY_HOST[netloc] = int(_FAILS_BY_HOST.get(netloc, 0) or 0) + 1
                try:
                    raw, final = fetch_via_mechanize()
                except Exception as e2:
                    last_exc = e2
                else:
                    mark_success()
                    return raw, final
            if is_last_attempt:
                # Re-raises the HTTPError being handled.
                raise

        except Exception as e:
            last_exc = e
            if is_last_attempt:
                raise

        # Exponential backoff before the next attempt.
        time.sleep(backoff_s * (2 ** attempt))

    # Should not reach here, but keep a useful exception
    if last_exc is not None:
        raise last_exc
    raise Exception('Failed to fetch URL')


# Matches an XML declaration at the very start of a byte string (leading
# whitespace allowed) and captures the value of its encoding="..." attribute.
# Case-insensitive; operates on bytes.
_XML_DECL_RE = re.compile(br'^\s*<\?xml\s+[^>]*encoding\s*=\s*["\']([^"\']+)["\']', re.I)


def _detect_xml_encoding(xml_bytes):
    """Best-effort guess of a feed's text encoding.

    The encoding named in the XML declaration (looked for in the first 240
    bytes) wins. Otherwise calibre's bundled chardet is asked. Returns ''
    when the input is not bytes-like or nothing could be determined.
    """
    payload = xml_bytes or b''
    if not isinstance(payload, (bytes, bytearray)):
        return ''
    try:
        payload = bytes(payload)
    except Exception:
        return ''

    def declared_encoding():
        # Encoding from the XML header, or '' if there is none.
        found = _XML_DECL_RE.match(payload[:240])
        if not found:
            return ''
        return str((found.group(1) or b'').decode('ascii', 'ignore').strip())

    def guessed_encoding():
        # Secondary hint: calibre ships a chardet.
        from calibre.ebooks.chardet import detect as calibre_detect
        guess = (calibre_detect(payload) or {}).get('encoding')
        return str(guess or '').strip()

    try:
        name = declared_encoding()
    except Exception:
        name = ''
    if name:
        return name

    try:
        return guessed_encoding()
    except Exception:
        return ''


def _feed_type_from_root(root):
    """Return a human-readable feed type label for a parsed root element.

    The decision is based on the root tag's local name (namespace removed):
    ``feed`` gives 'ATOM 1.0', ``rss`` gives an 'RSS ...' label derived from
    its ``version`` attribute, and ``rdf`` gives 'RDF (RSS 1.0)'. The ``rss``
    and ``rdf`` checks ignore case; the ``feed`` check does not. Anything
    else, or any error while inspecting ``root``, yields ''.
    """
    try:
        raw_tag = root.tag or ''
        if isinstance(raw_tag, str) and '}' in raw_tag:
            local = raw_tag.rsplit('}', 1)[-1]
        else:
            local = raw_tag
    except Exception:
        local = ''

    try:
        if local == 'feed':
            return 'ATOM 1.0'

        kind = str(local).lower()
        if kind == 'rdf':
            return 'RDF (RSS 1.0)'
        if kind != 'rss':
            return ''

        try:
            version = str(root.attrib.get('version') or '').strip()
        except Exception:
            version = ''
        if version.startswith('0.9'):
            return 'RSS 0.91/0.92/0.93'
        if version.startswith('2'):
            return 'RSS 2.0/2.0.1'
        if version:
            return ('RSS ' + version).strip()
        return 'RSS'
    except Exception:
        return ''


def _sanitize_xml_text(text):
    """Clean decoded XML text so a parser is more likely to accept it.

    Removes control characters (everything below U+0020 except tab, newline
    and carriage return, plus DEL), then strips leading byte-order marks and
    leading whitespace. Falsy or unconvertible input gives ''.
    """
    try:
        cleaned = str(text or '')
    except Exception:
        return ''
    if not cleaned:
        return ''

    # Code points to delete: \x00-\x08, \x0B, \x0C, \x0E-\x1F and \x7F.
    illegal = dict.fromkeys(
        list(range(0x00, 0x09)) + [0x0B, 0x0C] + list(range(0x0E, 0x20)) + [0x7F]
    )
    return cleaned.translate(illegal).lstrip('\ufeff').lstrip()


def _decode_xml_bytes(xml_bytes):
    """Decode feed bytes to text without ever failing.

    ``str`` input is returned unchanged. For bytes, the encoding named in the
    XML declaration is preferred, then a guess from calibre's chardet, then a
    list of common feed encodings. Undecodable bytes are replaced rather than
    raising.

    NOTE: because decoding uses ``errors='replace'``, a candidate is only
    skipped when Python does not know the encoding name at all.
    """
    payload = xml_bytes or b''
    if isinstance(payload, str):
        return payload
    try:
        payload = bytes(payload)
    except Exception:
        payload = b''

    # Prefer encoding declared in XML header.
    hint = None
    try:
        declaration = _XML_DECL_RE.match(payload[:240])
        if declaration:
            hint = (declaration.group(1) or b'').decode('ascii', 'ignore').strip()
    except Exception:
        hint = None

    # Calibre ships a chardet; try it as a secondary hint.
    if not hint:
        try:
            from calibre.ebooks.chardet import detect as calibre_detect
            hint = (calibre_detect(payload) or {}).get('encoding')
        except Exception:
            hint = None

    # Common feed encodings, tried after the hint (if any).
    common = ['utf-8', 'utf-16', 'windows-1252', 'windows-1251', 'iso-8859-1', 'latin1']
    candidates = ([str(hint)] if hint else []) + common

    for name in candidates:
        if name:
            try:
                return payload.decode(name, errors='replace')
            except Exception:
                pass

    try:
        return payload.decode('utf-8', errors='replace')
    except Exception:
        return ''


def parse_feed(xml_bytes, base_url=''):
    """Parse raw feed data (Atom, RSS 2.0-style or RDF) into a dict.

    ``xml_bytes`` is the raw feed document. A UTF-8 byte-order mark and
    leading whitespace are removed from bytes input before parsing. If the
    first parse fails, the data is decoded best-effort, stripped of control
    characters, and parsed again.

    ``base_url`` is passed to the format-specific parsers, which use it to
    resolve relative links.

    Returns the dict produced by the format-specific parser (keys ``title``,
    ``link``, ``items``, ``image_url``) with two keys added: ``feed_type``
    (label from ``_feed_type_from_root``) and ``feed_encoding`` (from
    ``_detect_xml_encoding``). A root element that is neither Atom ``feed``
    nor ``rss`` is handed to the RDF parser.

    Raises whatever ``ET.fromstring`` raises when the sanitized retry also
    fails to parse.
    """
    feed_encoding = _detect_xml_encoding(xml_bytes)

    data = xml_bytes or b''
    if isinstance(data, (bytes, bytearray)):
        if data.startswith(b'\xef\xbb\xbf'):
            data = data[3:]
        data = data.lstrip()

    # Primary parse path
    try:
        root = ET.fromstring(data)
    except Exception:
        # Fallback: decode with best-effort, sanitize illegal chars, and retry.
        # This helps with real-world feeds that contain bad bytes/control chars.
        decoded = _decode_xml_bytes(data)
        sanitized = _sanitize_xml_text(decoded)
        root = ET.fromstring(sanitized)

    feed_type = _feed_type_from_root(root)

    tag = root.tag or ''
    namespaced = isinstance(tag, str)
    if tag == 'feed' or (namespaced and tag.endswith('}feed')):
        out = _parse_atom(root, base_url)
    elif tag == 'rss' or (namespaced and tag.endswith('}rss')):
        # RSS 2.0: <rss><channel>...
        channel = root.find('channel')
        if channel is None:
            # try localname
            channel = next(_iter_children_by_localname(root, 'channel'), None)
        # Compare against None explicitly: an Element without children is
        # falsy, so `channel or root` would discard an empty <channel>.
        out = _parse_rss_channel(channel if channel is not None else root, base_url)
    else:
        # RSS 1.0 RDF: <rdf:RDF> with <channel> and <item>
        out = _parse_rdf(root, base_url)

    out['feed_type'] = feed_type
    out['feed_encoding'] = feed_encoding
    return out


def _parse_atom(feed_elem, base_url):
    """Parse an Atom ``<feed>`` element into the common feed dict.

    ``base_url`` is used to resolve relative links, the feed icon and
    enclosure URLs; with an empty ``base_url`` they are returned as found.

    Returns ``{'title', 'link', 'items', 'image_url'}``. Each item is a dict
    with ``id``, ``title``, ``link``, ``author``, ``published`` (ISO 8601 UTC
    string or ''), ``summary``, ``content`` and ``enclosures`` (list of dicts
    with ``url``, ``type``, ``length``).
    """
    title = _text_by_localname(feed_elem, 'title')

    image_url = _text_by_localname(feed_elem, 'icon')
    if base_url and image_url:
        image_url = urllib.parse.urljoin(base_url, image_url)

    # Feed link: first <link> with an href whose rel is absent or "alternate".
    link = ''
    for link_elem in _iter_children_by_localname(feed_elem, 'link'):
        rel = (link_elem.attrib.get('rel') or '').strip().lower()
        href = (link_elem.attrib.get('href') or '').strip()
        if href and rel in ('', 'alternate'):
            link = href
            break
    # NOTE: urljoin() with an empty link returns base_url, so the feed-level
    # link falls back to base_url when the feed declares none.
    link = urllib.parse.urljoin(base_url, link) if base_url else link

    entries = []
    for entry in _iter_children_by_localname(feed_elem, 'entry'):
        etitle = _text_by_localname(entry, 'title')

        # Author (Atom: <author><name>..), falling back to <email>.
        author = ''
        for author_elem in _iter_children_by_localname(entry, 'author'):
            author = _text_by_localname(author_elem, 'name') or _text_by_localname(author_elem, 'email')
            if author:
                break

        eid = _text_by_localname(entry, 'id')
        if not eid:
            eid = _attr_by_localname(entry, 'link', 'href')

        elink = ''
        enclosures = []
        for link_elem in _iter_children_by_localname(entry, 'link'):
            rel = (link_elem.attrib.get('rel') or '').strip().lower()
            href = (link_elem.attrib.get('href') or '').strip()
            if not href:
                continue
            if rel in ('', 'alternate') and not elink:
                elink = href
                continue
            if rel == 'enclosure':
                etype = (link_elem.attrib.get('type') or '').strip().lower()
                eurl = urllib.parse.urljoin(base_url, href) if base_url else href
                if eurl:
                    enclosures.append({
                        'url': eurl,
                        'type': etype,
                        'length': (link_elem.attrib.get('length') or '').strip(),
                    })
        # Only resolve a link that exists: urljoin(base_url, '') would return
        # base_url, giving link-less entries the feed URL as link (and as id
        # when <id> is missing too), so such entries would share one id.
        if base_url and elink:
            elink = urllib.parse.urljoin(base_url, elink)

        updated = _parse_date_to_iso(_text_by_localname(entry, 'updated') or _text_by_localname(entry, 'published'))
        summary = _text_by_localname(entry, 'summary')

        # Atom feeds often put the real/verbose HTML in <content type="html"> and a short blurb in <summary>.
        # Prefer <content> when present (e.g. Standard Ebooks), but keep both.
        content = ''
        for content_elem in _iter_children_by_localname(entry, 'content'):
            if content_elem.text:
                content = _strip(content_elem.text)
                break

        # Choose the more informative field for display.
        if content and (not summary or len(content) > len(summary) + 40):
            summary = content

        entries.append({
            'id': _strip(eid) or _strip(elink) or _strip(etitle),
            'title': _strip(etitle),
            'link': _strip(elink),
            'author': _strip(author),
            'published': updated,
            'summary': summary,
            'content': content,
            'enclosures': enclosures,
        })

    return {'title': title, 'link': link, 'items': entries, 'image_url': image_url}


def _parse_rss_channel(channel, base_url):
    """Parse an RSS ``<channel>`` element into the common feed dict.

    ``base_url`` is used to resolve relative links, the channel image and
    enclosure URLs; with an empty ``base_url`` they are returned as found.

    Returns ``{'title', 'link', 'items', 'image_url'}``. Each item is a dict
    with ``id``, ``title``, ``link``, ``author``, ``published`` (ISO 8601 UTC
    string or ''), ``summary``, ``content`` and ``enclosures`` (list of dicts
    with ``url``, ``type``, ``length``). Items and enclosures are returned
    once each, in document order.
    """
    # Un-namespaced elements are preferred; a namespaced element with the
    # same local name is only used as a fallback.
    title = _first_text(channel, ['title']) or _text_by_localname(channel, 'title')
    link = _first_text(channel, ['link']) or _text_by_localname(channel, 'link')
    # NOTE: urljoin() with an empty link returns base_url, so the feed-level
    # link falls back to base_url when the channel declares none.
    link = urllib.parse.urljoin(base_url, link) if base_url else link

    image_url = ''
    try:
        img = channel.find('image')
        if img is None:
            img = next(_iter_children_by_localname(channel, 'image'), None)
        if img is not None:
            image_url = _first_text(img, ['url']) or _text_by_localname(img, 'url')
            image_url = (image_url or '').strip()
            if base_url and image_url:
                image_url = urllib.parse.urljoin(base_url, image_url)
    except Exception:
        image_url = ''

    items = []
    # _iter_children_by_localname() already matches plain <item> as well as
    # namespaced ones; combining it with findall('item') listed every plain
    # item twice.
    for item in _iter_children_by_localname(channel, 'item'):
        ititle = _first_text(item, ['title']) or _text_by_localname(item, 'title')
        ilink = _first_text(item, ['link']) or _text_by_localname(item, 'link')
        guid = _first_text(item, ['guid']) or _text_by_localname(item, 'guid')
        pub = _first_text(item, ['pubDate']) or _text_by_localname(item, 'pubDate')
        pub_iso = _parse_date_to_iso(pub)

        # Author (RSS: <dc:creator> or <author>)
        author = (
            _text_by_localname(item, 'creator')
            or _first_text(item, ['author'])
            or _text_by_localname(item, 'author')
        )
        # Common RSS <author> patterns include emails. Try to extract a human
        # name from "email (Name)" or "Name <email>".
        raw_author = (author or '').strip()
        if raw_author:
            in_parens = re.search(r'\(([^\)]+)\)', raw_author)
            if in_parens and in_parens.group(1).strip():
                author = in_parens.group(1).strip()
            else:
                before_angle = re.search(r'([^<]+)\s*<[^>]+>', raw_author)
                if before_angle and before_angle.group(1).strip():
                    author = before_angle.group(1).strip()

        # RSS often provides a short <description> and the real/verbose HTML in
        # <content:encoded>. Prefer encoded when it looks substantially richer.
        desc = _first_text(item, ['description']) or _text_by_localname(item, 'description')
        encoded = _text_by_localname(item, 'encoded')  # content:encoded localname
        short_text = _strip(desc)
        rich_text = _strip(encoded)
        if rich_text and (not short_text or len(rich_text) > len(short_text) + 40):
            desc = rich_text

        enclosures = []
        # As with items: the localname iterator alone covers plain and
        # namespaced <enclosure> elements without duplicating the plain ones.
        for enc in _iter_children_by_localname(item, 'enclosure'):
            eurl = (enc.attrib.get('url') or '').strip()
            if not eurl:
                continue
            if base_url:
                eurl = urllib.parse.urljoin(base_url, eurl)
            if eurl:
                enclosures.append({
                    'url': eurl,
                    'type': (enc.attrib.get('type') or '').strip().lower(),
                    'length': (enc.attrib.get('length') or '').strip(),
                })

        # Only resolve a link that exists: urljoin(base_url, '') would return
        # base_url, giving link-less items the feed URL as link (and as id
        # when <guid> is missing too), so such items would share one id.
        if base_url and ilink:
            ilink = urllib.parse.urljoin(base_url, ilink)

        items.append({
            'id': _strip(guid) or _strip(ilink) or _strip(ititle),
            'title': _strip(ititle),
            'link': _strip(ilink),
            'author': _strip(author),
            'published': pub_iso,
            'summary': desc,
            'content': _strip(encoded),
            'enclosures': enclosures,
        })

    return {'title': title, 'link': link, 'items': items, 'image_url': image_url}


def _parse_rdf(root, base_url):
    """Parse an RSS 1.0 / RDF root element into the common feed dict.

    Title and link come from the first ``channel`` child of ``root`` (or from
    ``root`` itself when there is none); items are the ``item`` children of
    ``root``. ``base_url`` is used to resolve relative links.

    Returns ``{'title', 'link', 'items', 'image_url'}`` with ``image_url``
    always ''. Each item has the same keys as the items produced by
    ``_parse_atom`` and ``_parse_rss_channel``; ``content`` is always '' and
    ``enclosures`` always empty here.
    """
    # Find channel title/link if present
    channel = next(_iter_children_by_localname(root, 'channel'), None)
    if channel is None:
        channel = root

    title = _text_by_localname(channel, 'title')
    link = _text_by_localname(channel, 'link')
    # NOTE: urljoin() with an empty link returns base_url, so the feed-level
    # link falls back to base_url when the channel declares none.
    link = urllib.parse.urljoin(base_url, link) if base_url else link

    items = []
    for item in _iter_children_by_localname(root, 'item'):
        ititle = _text_by_localname(item, 'title')
        ilink = _text_by_localname(item, 'link')
        # Only resolve a link that exists: urljoin(base_url, '') would return
        # base_url, giving link-less items the feed URL as both link and id,
        # so such items would share one id.
        if base_url and ilink:
            ilink = urllib.parse.urljoin(base_url, ilink)

        # Author: commonly <dc:creator>
        author = _text_by_localname(item, 'creator') or _text_by_localname(item, 'author')

        pub_iso = _parse_date_to_iso(_text_by_localname(item, 'date'))
        desc = _text_by_localname(item, 'description')

        items.append({
            'id': _strip(ilink) or _strip(ititle),
            'title': _strip(ititle),
            'link': _strip(ilink),
            'author': _strip(author),
            'published': pub_iso,
            'summary': desc,
            # Present so every parser returns items with the same keys.
            'content': '',
            'enclosures': [],
        })

    return {'title': title, 'link': link, 'items': items, 'image_url': ''}


def normalize_summary_to_html(summary_text):
    """Turn a feed summary into HTML suitable for display.

    Very small sanitizer/formatter with two modes:

    * Text containing both ``<`` and ``>`` is treated as HTML and returned
      after a few clean-ups: ``-webkit-calc(`` becomes ``calc(``, ``<img>``
      tags with an empty ``src`` are removed, and links to audio files are
      replaced with inline ``<audio>`` players.
    * Anything else is treated as plain text: it is HTML-escaped, bare
      http(s) URLs become links (or ``<audio>`` players for audio files), and
      the result is wrapped in a ``<pre>`` that preserves line breaks.

    Literal CDATA wrappers (escaped or not) around the text are stripped
    first in both modes.

    Returns the HTML string.
    """
    txt = summary_text or ''

    # Some feeds incorrectly embed literal CDATA markers inside text, e.g.
    #   &lt;![CDATA[&lt;img ...&gt;...]]&gt;
    # which becomes '<![CDATA[<img ...>...]]>' after html.unescape() in the UI.
    # Strip these wrappers so the embedded HTML can render.
    try:
        s = str(txt or '')
        # Handle both escaped and unescaped markers (up to two nested layers).
        for _ in range(2):
            ss = s.strip()
            for marker in ('&lt;![CDATA[', '<![CDATA['):
                if ss.startswith(marker):
                    # Remove leading marker (keep inner content as-is).
                    s = ss[len(marker):]
                    break
            else:
                break
        for _ in range(2):
            ss = s.strip()
            for marker in (']]&gt;', ']]>'):
                if ss.endswith(marker):
                    s = ss[:-len(marker)]
                    break
            else:
                break
        txt = s
    except Exception:
        pass

    if '<' in txt and '>' in txt:
        try:
            # Clean CSS issues: replace -webkit-calc with calc
            txt = re.sub(r'-webkit-calc\(', 'calc(', txt)
            # Remove empty src attributes
            txt = re.sub(r'<img[^>]*src=""[^>]*>', '', txt, flags=re.IGNORECASE)

            # Replace audio links with inline audio players
            def replace_audio_link(match):
                href = match.group(1)
                return '<audio controls><source src="%s" type="audio/mpeg"></audio>' % href
            txt = re.sub(r'<a[^>]*href="([^\"]*\.(mp3|wav|ogg|aac|m4a))"[^>]*>.*?</a>', replace_audio_link, txt, flags=re.IGNORECASE)
        except Exception:
            pass
        return txt

    def esc(s):
        # Escaping for text content.
        return (s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;'))

    def esc_attr(s):
        # Escaping for a double-quoted attribute value. The URL pattern below
        # accepts any non-space character, so a '"' in feed text must not be
        # able to end the attribute and inject further attributes.
        return esc(s).replace('"', '&quot;')

    def is_audio_url(u):
        try:
            return bool(re.search(r'\.(mp3|wav|ogg|aac|m4a)$', u, re.IGNORECASE))
        except Exception:
            return False

    url_re = re.compile(r'(https?://\S+)')
    parts = []
    pos = 0
    for m in url_re.finditer(txt):
        parts.append(esc(txt[pos:m.start()]))
        u = m.group(1)
        if is_audio_url(u):
            parts.append('<audio controls><source src="%s" type="audio/mpeg"></audio>' % esc_attr(u))
        else:
            parts.append('<a href="%s">%s</a>' % (esc_attr(u), esc(u)))
        pos = m.end()
    parts.append(esc(txt[pos:]))
    return '<pre style="white-space:pre-wrap">%s</pre>' % ''.join(parts)
