from __future__ import absolute_import

# load_translations is neither defined nor imported in this file; presumably
# calibre's plugin loader injects it and it installs the `_()` translation
# function used throughout this module -- TODO confirm. When the name is not
# available the NameError is ignored.
try:
    load_translations()
except NameError:
    pass

# ExportWorker moved from ui.py
from PyQt5.QtCore import QObject, pyqtSignal
import tempfile, os

# Helper: safe filename builder (avoid importing from utils which may not expose it)
def _safe_filename(s):
    """Turn an arbitrary title into a conservative file-name stem.

    Each of the characters ``<>:"/\\|?*`` is replaced by an underscore,
    whitespace runs collapse to a single space and the result is capped at
    140 characters. Falsy input, or input that ends up blank, yields
    ``'news'``.
    """
    text = (s or '').strip()
    if not text:
        return 'news'
    # A single translate() pass performs the per-character substitution.
    text = text.translate({ord(ch): '_' for ch in '<>:"/\\|?*'})
    collapsed = ' '.join(text.split())
    trimmed = collapsed[:140].strip()
    return trimmed if trimmed else 'news'

from calibre_plugins.rss_reader.rss import normalize_summary_to_html
try:
    # Use the same wrapper as the UI export paths so we get:
    # - consistent URL sanitization (e.g. &amp;, protocol-relative URLs)
    # - user prefs (download uncached images, optimize thresholds)
    # - the same cache_dir used by PreviewBrowser
    from calibre_plugins.rss_reader.preview_browser import _process_images_for_export
except Exception:
    # Fallback when the import above fails for any reason: accept the same
    # call shape but return the HTML unchanged, so no image processing is
    # done by this stand-in.
    def _process_images_for_export(html_str, base_url, td, do_download=True, images_subdir=None, **kwargs):
        return html_str

class ExportWorker(QObject):
    """Background worker for export image processing to avoid blocking UI.

    ``run`` converts the cached items of the selected feeds into an OEB
    periodical on disk (embedding images per article) and reports the outcome
    through ``finished``.

    ``export_data`` must contain ``feed_ids``, ``cache``, ``feeds_by_id``,
    ``news_title`` and ``td``. ``ts_suffix`` is read when the
    ``export_add_to_library`` preference is set, ``out_path`` otherwise, and
    ``final_out_path`` is optional.
    """

    progress = pyqtSignal(str)  # Progress message
    # Result data. On success: {'html_path': str, 'td': str, 'news_title': str,
    # 'comments_html': str, 'convert_ext': str, 'final_ext': str,
    # 'markdown_via_txt': bool, 'add_to_library': bool, 'out_path': str,
    # 'final_out_path': str or None}. On failure/cancel: {'error': str}.
    finished = pyqtSignal(dict)
    start = pyqtSignal()  # Signal to start the work

    # Extensions treated as images when an enclosure has no image/* MIME type.
    _IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.avif', '.bmp', '.svg')
    # Upper bound on enclosure images appended to a single article.
    _MAX_ENCLOSURE_IMAGES = 8
    # Preferences forced to True while one article's images are processed.
    _DEBUG_PREF_KEYS = ('debug_export_images', 'debug_fetch_images')

    def __init__(self, export_data):
        super().__init__()
        self.export_data = export_data  # Dict with all needed data
        self.result = None  # Store result for thread finished handler
        self._cancelled = False

    def cancel(self):
        """Ask ``run`` to stop; the flag is checked before each feed and article."""
        self._cancelled = True

    def run(self):
        """Build the export, store the outcome in ``self.result`` and emit it.

        ``finished`` is emitted exactly once with either the success payload
        or ``{'error': message}`` (cancellation, nothing to export, missing
        helpers, or any unexpected exception).
        """
        try:
            outcome = self._prepare_export()
        except Exception as e:
            outcome = {'error': str(e)}
        self.result = outcome
        self.finished.emit(outcome)

    # -- helpers ---------------------------------------------------------

    def _report(self, template, *args):
        """Emit a progress message; reporting problems never abort the export."""
        try:
            self.progress.emit(template % args if args else template)
        except Exception:
            pass

    @staticmethod
    def _cancelled_result():
        """Return the error payload used when the export was cancelled."""
        return {'error': _('Export cancelled.')}

    @staticmethod
    def _image_base_url(item, fid, feeds_by_id):
        """Pick the URL against which relative image references are resolved.

        Preference order: the article link, the item's ``_feed_url``, then the
        ``url`` of the feed the item belongs to.
        """
        link = item.get('link') or ''
        try:
            base_fid = str(item.get('_feed_id') or str(fid) or '')
        except Exception:
            base_fid = str(fid) if fid is not None else ''
        try:
            feed_url = str((feeds_by_id.get(base_fid) or {}).get('url') or '')
        except Exception:
            feed_url = ''
        try:
            return str(link or item.get('_feed_url') or feed_url or '')
        except Exception:
            return str(link or feed_url or '')

    @staticmethod
    def _summary_html(item):
        """Return the item's summary, entity-unescaped and normalized to HTML."""
        import html

        raw_summary = item.get('summary') or ''
        try:
            raw_summary = html.unescape(raw_summary)
        except Exception:
            pass
        return normalize_summary_to_html(raw_summary)

    def _enclosure_image_urls(self, item, base_url):
        """Collect unique, absolute URLs of the item's image enclosures.

        An enclosure counts as an image when its ``type`` starts with
        ``image/`` or, failing that, when its URL path has a known image
        extension. Malformed enclosures are skipped.
        """
        from urllib.parse import urljoin, urlparse

        urls = []
        seen = set()
        for enc in (item.get('enclosures') or []):
            try:
                if not isinstance(enc, dict):
                    continue
                eurl = (enc.get('url') or '').strip()
                if not eurl:
                    continue
                etype = (enc.get('type') or '').strip().lower()
                if not etype.startswith('image/'):
                    ext = os.path.splitext(urlparse(eurl).path or '')[1].lower()
                    if ext not in self._IMAGE_EXTENSIONS:
                        continue
                if base_url:
                    try:
                        eurl = urljoin(base_url, eurl)
                    except Exception:
                        pass
                try:
                    eurl = _sanitize_url_for_fetch(eurl)
                except Exception:
                    pass
            except Exception:
                continue
            if eurl and eurl not in seen:
                seen.add(eurl)
                urls.append(eurl)
        return urls

    def _embed_images(self, summary, base_url, td, images_subdir, prefs):
        """Run the image export step for one article and return the new HTML.

        While the step runs, the debug preferences in ``_DEBUG_PREF_KEYS`` are
        forced on so logs show cache hits/misses and fetch attempts; their
        previous values are restored afterwards even if processing raises.
        """
        saved = {}
        if prefs is not None:
            try:
                saved = {key: prefs.get(key, False) for key in self._DEBUG_PREF_KEYS}
                for key in saved:
                    prefs[key] = True
            except Exception:
                pass
        try:
            return _process_images_for_export(
                summary,
                base_url=base_url,
                td=td,
                do_download=True,
                images_subdir=images_subdir,
            )
        finally:
            try:
                for key, old in saved.items():
                    if old is not None:
                        prefs[key] = bool(old)
            except Exception:
                pass

    @staticmethod
    def _comments_html(comments_items):
        """Render ``(title, link)`` pairs as an escaped HTML ``<ul>`` list."""
        import html

        untitled = _('(untitled)')
        rows = []
        for title, link in comments_items:
            label = html.escape(str(title or untitled))
            if link:
                rows.append('<li><a href="%s">%s</a></li>' % (html.escape(str(link)), label))
            else:
                rows.append('<li>%s</li>' % label)
        return '<ul>' + ''.join(rows) + '</ul>'

    def _prepare_export(self):
        """Do the work for ``run`` and return the result dict (never emits ``finished``)."""
        data = self.export_data
        feed_ids = list(data['feed_ids'] or [])
        cache = data['cache'] or {}
        feeds_by_id = data['feeds_by_id'] or {}
        news_title = data['news_title']
        td = data['td']

        # Resolve preferences once, under a private name, so a failed import
        # cannot leave a half-bound local behind.
        try:
            from calibre_plugins.rss_reader.config import plugin_prefs as prefs
        except Exception:
            prefs = None

        # Precompute totals for clearer progress messages
        total_feeds = len(feed_ids)
        total_articles = 0
        for fid in feed_ids:
            try:
                total_articles += len((cache.get(fid) or {}).get('items') or [])
            except Exception:
                pass

        try:
            if total_feeds and total_articles:
                self.progress.emit(_('Preparing export (%d feeds, %d articles)...') % (total_feeds, total_articles))
            elif total_feeds:
                self.progress.emit(_('Preparing export (%d feeds)...') % total_feeds)
            else:
                self.progress.emit(_('Preparing export...'))
        except Exception:
            self.progress.emit(_('Preparing export...'))

        try:
            from calibre_plugins.rss_reader.export_oeb import build_oeb_periodical, append_enclosure_images
        except Exception:
            return {'error': _('Export helpers unavailable.')}

        export_feeds = []
        comments_items = []
        any_items = False
        processed_articles = 0

        for feed_idx, fid in enumerate(feed_ids):
            if self._cancelled:
                return self._cancelled_result()
            entry = cache.get(fid) or {}
            feed_title = entry.get('title') or (feeds_by_id.get(fid) or {}).get('title') or _('(untitled feed)')
            feed_bucket = {'title': feed_title, 'items': []}
            export_feeds.append(feed_bucket)
            items = list(entry.get('items') or [])
            if not items:
                continue
            self._report(_('Building feed %d/%d: %s'), feed_idx + 1, max(1, total_feeds), feed_title)
            any_items = True

            for item_idx, it in enumerate(items):
                if self._cancelled:
                    return self._cancelled_result()
                it = it or {}
                processed_articles += 1
                if total_articles:
                    self._report(_('Processing article %d/%d...'), processed_articles, total_articles)
                else:
                    self._report(_('Processing article %d...'), processed_articles)

                title = it.get('title') or _('(untitled)')
                link = it.get('link') or ''
                published = it.get('published') or ''
                base_url_for_images = self._image_base_url(it, fid, feeds_by_id)
                summary = self._summary_html(it)

                # Export should try to embed images regardless of the per-feed
                # preview setting. Network fetching is still controlled by
                # `export_download_uncached_images` inside export_images.
                try:
                    enc_urls = self._enclosure_image_urls(it, base_url_for_images)
                    if enc_urls:
                        summary = append_enclosure_images(
                            summary,
                            enc_urls[:self._MAX_ENCLOSURE_IMAGES],
                            base_url=base_url_for_images,
                            sanitize_url=_sanitize_url_for_fetch,
                        )
                except Exception:
                    pass

                # Download/relativize images for export (per-article subdir).
                # This mirrors the single-article export flow so calibre can
                # package images into the generated EPUB.
                try:
                    summary = _normalize_images_for_preview(summary, base_url=base_url_for_images or '', preserve_local=False)
                except Exception:
                    pass
                if total_articles:
                    self._report(_('Embedding images for article %d/%d...'), processed_articles, total_articles)
                else:
                    self._report(_('Embedding images...'))
                try:
                    summary = self._embed_images(
                        summary,
                        base_url_for_images,
                        td,
                        f"feed_{feed_idx}/article_{item_idx}/images",
                        prefs,
                    )
                except Exception:
                    # Best effort: keep the un-embedded HTML for this article.
                    pass

                comments_items.append((title, link))

                try:
                    summary = _normalize_images_for_preview(summary, base_url=base_url_for_images or '', preserve_local=True)
                except Exception:
                    pass

                feed_bucket['items'].append(
                    {
                        'title': title,
                        'link': link,
                        'published': published,
                        'body_html': summary,
                    }
                )

        if not any_items:
            return {'error': _('No items to export in the selected feeds.')}

        html_path = build_oeb_periodical(td, news_title, export_feeds)

        try:
            comments_html = self._comments_html(comments_items)
        except Exception:
            comments_html = ''

        # Get conversion settings (EPUB when they cannot be determined)
        convert_ext, final_ext, markdown_via_txt = 'epub', 'epub', False
        if prefs is not None:
            try:
                from calibre_plugins.rss_reader.convert_utils import (
                    get_effective_output_format,
                    get_available_output_formats,
                    resolve_conversion_output,
                )
                convert_ext, final_ext, markdown_via_txt = resolve_conversion_output(
                    get_effective_output_format(prefs),
                    available_output_formats=get_available_output_formats(),
                )
            except Exception:
                convert_ext, final_ext, markdown_via_txt = 'epub', 'epub', False

        add_to_library = bool(prefs.get('export_add_to_library', False)) if prefs is not None else False

        if add_to_library:
            out_path = os.path.join(
                tempfile.gettempdir(),
                f"{_safe_filename(str(news_title))} - {self.export_data['ts_suffix']}.{convert_ext}",
            )
        else:
            out_path = self.export_data['out_path']

        final_out_path = self.export_data.get('final_out_path')

        self._report(_('Export preparation complete.'))

        return {
            'html_path': html_path,
            'td': td,
            'news_title': news_title,
            'comments_html': comments_html,
            'convert_ext': convert_ext,
            'final_ext': final_ext,
            'markdown_via_txt': markdown_via_txt,
            'add_to_library': add_to_library,
            'out_path': out_path,
            'final_out_path': final_out_path,
        }
#!/usr/bin/env python
"""Article fetching and content extraction utilities.

This module provides functions for fetching full article content from URLs,
extracting article text from JSON-LD, and processing HTML for preview/export.

Extracted from ui.py to improve maintainability.
"""


import re
import traceback

# Debug logging helper; replaced by a silent no-op when the plugin's debug
# module cannot be imported.
try:
    from calibre_plugins.rss_reader.debug import _debug, DEBUG_RSS_READER
except ImportError:
    def _debug(*args, **kwargs):
        """No-op stand-in: accepts any arguments and logs nothing."""
        pass
    DEBUG_RSS_READER = False

# Plugin preferences; an empty dict stands in when the config module is
# missing, so `.get(key, default)` lookups fall through to their defaults.
try:
    from calibre_plugins.rss_reader.config import plugin_prefs
except ImportError:
    plugin_prefs = {}

# URL/image helpers shared with the preview browser; the fallbacks below are
# identity functions that return their first argument unchanged.
try:
    from calibre_plugins.rss_reader.preview_browser import (
        _sanitize_url_for_fetch,
        _normalize_images_for_preview,
    )
except ImportError:
    def _sanitize_url_for_fetch(url):
        """Fallback: return ``url`` unchanged."""
        return url
    def _normalize_images_for_preview(html, base_url='', preserve_local=False):
        """Fallback: return ``html`` unchanged, ignoring the other arguments."""
        return html


def _render_fetch_engine_banner(engine_label):
    """Return the fetch-engine banner markup, which is always empty.

    The banner was removed (too noisy / not useful in practice), so
    ``engine_label`` is accepted but not used.
    """
    banner = ''
    return banner


def _extract_article_from_jsonld(html_text):
    """Best-effort extraction of full article text from JSON-LD.

    Many news sites embed a NewsArticle/Article with an `articleBody` field in
    JSON-LD. Pulling it often yields full text even when the visible HTML is
    heavily scripted or the RSS contains only a short summary.

    Every ``<script type="...ld+json">`` block is scanned (only the first JSON
    value of each block is decoded, and ``@graph`` lists are followed). Among
    objects whose ``@type`` is NewsArticle, Article, Report or Analysis, the
    one with the longest ``articleBody``/``text`` wins. Each non-empty line of
    the body becomes an escaped ``<p>``; a string ``headline`` is prepended as
    ``<h2>``.

    Returns (html_fragment_or_None, image_urls_list). The result is
    ``(None, [])`` when nothing suitable is found or the best body is shorter
    than 200 characters.
    """
    import json
    import html as _html

    try:
        text = str(html_text or '')
    except Exception:
        return None, []

    scripts = re.findall(
        r'<script[^>]+type=["\'][^"\']*ld\+json[^"\']*["\'][^>]*>(.*?)</script>',
        text,
        flags=re.I | re.S,
    )
    if not scripts:
        return None, []

    article_types = ('newsarticle', 'article', 'report', 'analysis')
    min_body_len = 200

    def _iter_objects(obj):
        # Yield every dict reachable through top-level lists and @graph lists.
        if isinstance(obj, dict):
            yield obj
            g = obj.get('@graph')
            if isinstance(g, list):
                for x in g:
                    yield from _iter_objects(x)
        elif isinstance(obj, list):
            for x in obj:
                yield from _iter_objects(x)

    def _type_matches(t):
        # @type may be a single string or a list of strings.
        if not t:
            return False
        if isinstance(t, str):
            return t.strip().lower() in article_types
        if isinstance(t, list):
            return any(str(x).strip().lower() in article_types for x in t if x)
        return False

    def _image_urls(img):
        # `image` may be a URL string, an object with a `url`, or a list
        # mixing both forms.
        if isinstance(img, str):
            return [img] if img else []
        if isinstance(img, dict):
            u = img.get('url')
            return [u] if isinstance(u, str) and u else []
        if isinstance(img, list):
            found = []
            for x in img:
                if isinstance(x, (str, dict)):
                    found.extend(_image_urls(x))
            return found
        return []

    decoder = json.JSONDecoder()

    best_html = None
    best_len = 0
    best_imgs = []

    for raw in scripts:
        raw = (raw or '').strip()
        if not raw:
            continue
        try:
            # raw_decode tolerates trailing junk after the first JSON value.
            data, _end = decoder.raw_decode(raw)
        except (ValueError, RecursionError):
            continue

        for obj in _iter_objects(data):
            if not _type_matches(obj.get('@type')):
                continue

            body = obj.get('articleBody') or obj.get('text')
            if not body or not isinstance(body, str):
                continue

            headline = obj.get('headline')
            if not isinstance(headline, str):
                headline = ''

            body = body.replace('\r\n', '\n').replace('\r', '\n')
            paras = [p.strip() for p in body.split('\n') if p.strip()]
            body_html = ''.join('<p>%s</p>' % _html.escape(p) for p in paras)
            frag = ('<h2>%s</h2>%s' % (_html.escape(headline), body_html)) if headline else body_html

            # Keep the candidate with the longest plain-text body.
            plain_len = len(body)
            if plain_len > best_len:
                best_html = frag
                best_len = plain_len
                best_imgs = _image_urls(obj.get('image'))

    if not best_html or best_len < min_body_len:
        return None, []
    return best_html, best_imgs


def _extract_ft_full_html_from_jsonld(raw_html):
    """Mirror calibre's financial_times.recipe JSON-LD extraction.

    FT pages frequently ship the full `articleBody` in JSON-LD.
    This path is fast (no DOM parsing) and yields full text.

    The first JSON-LD object typed ``NewsArticle`` that has an
    ``articleBody``/``text`` is used. The output is a
    ``<div class="body_json">`` holding, in order: headline (``<h1>``),
    description (``<h2>``), author(s) (``<h3>``), the lead image when its URL
    is not already referenced in the body, then the body split into
    paragraphs on blank lines. ``[https://...]`` markers inside the body
    become ``<img>`` tags. Image URLs are routed through FT's image
    service via ``resize_img`` unless they contain ``studio``.

    Returns (html_fragment_or_None, image_urls_list); ``(None, [])`` when no
    article is found or its body is shorter than 200 characters.
    """
    import html as _html
    import json
    from urllib.parse import quote

    try:
        raw = str(raw_html or '')
    except Exception:
        return None, []

    # Find JSON-LD blocks and locate a NewsArticle with articleBody.
    scripts = re.findall(
        r'<script[^>]+type=["\'][^"\']*ld\+json[^"\']*["\'][^>]*>(.*?)</script>',
        raw,
        flags=re.I | re.S,
    )
    if not scripts:
        return None, []

    decoder = json.JSONDecoder()

    def _iter_objects(obj):
        # Yield every dict reachable through top-level lists and @graph lists.
        if isinstance(obj, dict):
            yield obj
            g = obj.get('@graph')
            if isinstance(g, list):
                for x in g:
                    yield from _iter_objects(x)
        elif isinstance(obj, list):
            for x in obj:
                yield from _iter_objects(x)

    def _type_is_newsarticle(t):
        if not t:
            return False
        if isinstance(t, str):
            return t.strip().lower() == 'newsarticle'
        if isinstance(t, list):
            return any(isinstance(x, str) and x.strip().lower() == 'newsarticle' for x in t)
        return False

    article = None
    for s in scripts:
        s = (s or '').lstrip()
        if not s:
            continue
        try:
            # raw_decode tolerates trailing junk after the first JSON value.
            parsed, _end = decoder.raw_decode(s)
        except Exception:
            continue
        article = next(
            (
                o for o in _iter_objects(parsed)
                if _type_is_newsarticle(o.get('@type')) and (o.get('articleBody') or o.get('text'))
            ),
            None,
        )
        if article is not None:
            break

    if not article:
        return None, []

    title = str(article.get('headline') or article.get('name') or '')
    body = str(article.get('articleBody') or article.get('text') or '')

    # Debug: log body length
    try:
        _debug('FT JSON-LD body length: %d chars' % len(body))
        if 0 < len(body) < 500:
            _debug('FT JSON-LD body preview: %s' % (body[:200],))
    except Exception:
        pass

    if len(body) < 200:
        return None, []

    # Author may be a string, a dict, or a list of dicts/strings
    author = ''
    a = article.get('author')
    if isinstance(a, str):
        author = a
    elif isinstance(a, dict):
        author = str(a.get('name') or '')
    elif isinstance(a, list):
        names = []
        for x in a:
            if isinstance(x, dict) and x.get('name'):
                names.append(str(x.get('name')))
            elif isinstance(x, str) and x.strip():
                names.append(x.strip())
        author = ' and '.join(names)

    desc = str(article.get('description') or '')

    def resize_img(img_url):
        # Wrap the URL in FT's image service; fall back to the raw URL if it
        # cannot be percent-encoded (e.g. lone surrogates from JSON escapes).
        try:
            a = 'https://www.ft.com/__origami/service/image/v2/images/raw/'
            b = quote(str(img_url or ''), safe='')
            c = '?dpr=2&fit=scale-down&quality=medium&source=next&width=400'
            return a + b + c
        except Exception:
            return str(img_url or '')

    images = []

    # Title image: accept an object with `url`, a plain URL string, or a list
    # of either (first entry wins).
    title_image_html = ''
    img = article.get('image')
    if isinstance(img, list):
        img = next((x for x in img if isinstance(x, (str, dict))), None)
    if isinstance(img, dict):
        image_url = img.get('url')
    elif isinstance(img, str):
        image_url = img
    else:
        image_url = None
    # If not already referenced in body, include it at top
    if isinstance(image_url, str) and image_url and image_url not in body:
        u = resize_img(image_url)
        images.append(u)
        title_image_html = '<p><img src="%s"></p>' % _html.escape(u, quote=True)

    # Embedded image markers look like: [https://...]
    def _img_marker_to_token(mm):
        u = (mm.group(0) or '')[1:-1]
        if not u:
            return ''
        if 'studio' not in u:
            u = resize_img(u)
        images.append(u)
        # Blank lines isolate the token so it becomes its own paragraph below.
        return '\n\n__FT_IMG__%s\n\n' % u

    body = re.sub(r'\[https://\S+?\]', _img_marker_to_token, body)

    # Paragraphize
    body = body.replace('\r\n', '\n').replace('\r', '\n')
    parts = [p.strip() for p in re.split(r'\n\s*\n+', body) if p.strip()]

    body_html_parts = []
    for p in parts:
        if p.startswith('__FT_IMG__'):
            u = p[len('__FT_IMG__'):].strip()
            if u:
                body_html_parts.append('<p><img src="%s"></p>' % _html.escape(u, quote=True))
            continue
        body_html_parts.append('<p>%s</p>' % _html.escape(p))

    # Build minimal HTML similar to the recipe output
    pieces = ['<div class="body_json">']
    if title:
        pieces.append('<h1>%s</h1>' % _html.escape(title))
    if desc:
        pieces.append('<h2>%s</h2>' % _html.escape(desc))
    if author:
        pieces.append('<h3>%s</h3>' % _html.escape(author))
    if title_image_html:
        pieces.append(title_image_html)
    pieces.extend(body_html_parts)
    pieces.append('</div>')

    # De-dup images preserving order
    uniq = [u for u in dict.fromkeys(images) if u]
    return ''.join(pieces), uniq


def _extract_main_fragment_from_html(text):
    """Disabled legacy hook; always returns ``None``.

    Intentionally switched off for performance: DOM parsing huge pages can
    spike CPU in preview mode. The function is kept for a potential future
    opt-in but is not used by default. ``text`` is not inspected.
    """
    fragment = None
    return fragment


def _is_ft_host(host):
    """Return True when *host* is ``ft.com`` itself or one of its subdomains.

    A plain ``endswith('ft.com')`` test would also accept unrelated hosts
    such as ``microsoft.com`` or ``swift.com``.
    """
    if not isinstance(host, str):
        return False
    host = host.strip().lower().rstrip('.')
    return host == 'ft.com' or host.endswith('.ft.com')


def _extract_balanced_element_by_id(html_text, element_id, tag_names=('article', 'div')):
    """Extract the *inner HTML* of the first element with id=element_id.

    Uses a lightweight tag-balance scan (no full DOM parse) to handle nested
    elements of the same tag. ``tag_names`` are tried in order; the first tag
    name for which an opening element with the id exists decides the result.
    Returns '' if not found or if the element is never closed.
    """
    try:
        s = str(html_text or '')
    except Exception:
        return ''
    if not s or not element_id:
        return ''

    try:
        tags = [str(t).lower() for t in (tag_names or ()) if t]
    except Exception:
        tags = []
    if not tags:
        tags = ['div']

    for tag in tags:
        try:
            opening = re.search(
                r'(?is)<%s\b[^>]*\bid\s*=\s*["\']%s["\'][^>]*>' % (re.escape(tag), re.escape(element_id)),
                s,
            )
        except Exception:
            opening = None
        if not opening:
            continue

        inner_start = opening.end()
        # Match opening/closing tags of exactly this name, case-insensitively
        # and directly on `s`: indexes stay valid (lower-casing a copy can
        # change its length) and `<divider>` is not mistaken for `<div>`.
        token_re = re.compile(r'<(/?)%s(?=[\s/>])' % re.escape(tag), re.I)
        depth = 1
        for tok in token_re.finditer(s, inner_start):
            if not tok.group(1):
                depth += 1
                continue
            depth -= 1
            if depth == 0:
                if s.find('>', tok.end()) < 0:
                    # Truncated closing tag: treat as not found.
                    return ''
                return s[inner_start:tok.start()]
        return ''
    return ''


def _fetch_article_content_calibre_recipe_like(url, timeout=None, recipe_urn=''):
    """Recipe-mode full-article fetch.

    Applies matching calibre recipe preprocess_raw_html when available.
    This is what makes FT (and similar) return a full body.

    Extraction order: for FT (``recipe_urn`` ending in ``financial_times`` or
    an ft.com host) the ``#article-body`` element, then FT JSON-LD; for every
    page generic JSON-LD; finally the first 12 ``<p>`` blocks (or the first
    9000 characters when there are none).

    ``timeout`` defaults to the ``timeout_seconds`` preference (12 when
    unset). Returns ``(html_fragment, image_urls)``; ``('', [])`` when nothing
    could be fetched or any unexpected error occurs.
    """
    try:
        from calibre_plugins.rss_reader.rss import fetch_url
        from urllib.parse import urljoin, urlparse

        tout = int(timeout or plugin_prefs.get('timeout_seconds', 12) or 12)

        # `hostname` (unlike `netloc`) excludes credentials and port.
        try:
            host = urlparse(url or '').hostname or ''
        except Exception:
            host = ''
        on_ft_host = _is_ft_host(host)
        ft_mode = on_ft_host or (recipe_urn or '').endswith('financial_times')

        # Calibre's financial_times recipe uses a specific UA
        ua = 'Mozilla/5.0 (Java) outbrain' if ft_mode else None

        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        }
        if ua:
            headers['User-Agent'] = ua

        # For FT, force calibre's mechanize browser (FT often serves different HTML).
        if on_ft_host:
            try:
                from calibre_plugins.rss_reader.rss import _mechanize_fetch
                raw, final = _mechanize_fetch(url, timeout_seconds=tout, headers=headers, user_agent=ua)
            except Exception:
                raw, final = fetch_url(url, timeout_seconds=tout, headers=headers)
        else:
            raw, final = fetch_url(url, timeout_seconds=tout, headers=headers)
        if not raw:
            return '', []

        if isinstance(raw, str):
            text = raw
        else:
            try:
                text = raw.decode('utf-8')
            except Exception:
                # latin-1 maps every byte, so this cannot fail on bytes input.
                text = raw.decode('latin-1')

        page_url = final or url

        # Fast path for FT: mirror the recipe's preprocess_raw_html logic
        try:
            if ft_mode:
                # First check if we already have full article HTML (recipe does this)
                if 'id="article-body"' in text or "id='article-body'" in text:
                    article_body = _extract_balanced_element_by_id(text, 'article-body', tag_names=('article', 'div'))
                    if article_body and len(article_body) > 500:
                        try:
                            _debug('FT: extracted #article-body inner HTML len=%d' % len(article_body))
                        except Exception:
                            pass
                        try:
                            frag = _normalize_images_for_preview(article_body, base_url=page_url, preserve_local=False)
                        except Exception:
                            frag = article_body
                        imgs = []
                        try:
                            for mm in re.finditer(r'(?is)<img[^>]+(?:src|data-src)=\s*["\']([^"\']+)["\']', frag):
                                imgs.append(mm.group(1))
                        except Exception:
                            pass
                        return frag, imgs

                # Otherwise fall back to JSON-LD extraction
                ft_html, ft_imgs = _extract_ft_full_html_from_jsonld(text)
                if ft_html and len(ft_html) > 500:
                    try:
                        ft_html = _normalize_images_for_preview(ft_html, base_url=page_url, preserve_local=False)
                    except Exception:
                        pass
                    return ft_html, (ft_imgs or [])
        except Exception:
            try:
                _debug('FT extraction failed: %s' % (traceback.format_exc()[:800],))
            except Exception:
                pass

        # Generic: try JSON-LD next
        # (Still much cheaper than DOM parsing and works on many sites)
        try:
            jhtml, jimgs = _extract_article_from_jsonld(text)
            if jhtml:
                try:
                    jhtml = _normalize_images_for_preview(jhtml, base_url=page_url, preserve_local=False)
                except Exception:
                    pass
                return jhtml, (jimgs or [])
        except Exception:
            pass

        # Final fallback: avoid heavy DOM parsing; use simple paragraph extraction
        try:
            ps = re.findall(r'(?is)(<p[^>]*>.*?</p>)', text)
        except Exception:
            ps = []
        frag = '\n'.join(ps[:12]) if ps else (text[:9000])

        # Remove scripts/styles from fragment
        try:
            frag = re.sub(r'(?is)<script.*?>.*?</script>', '', frag)
            frag = re.sub(r'(?is)<style.*?>.*?</style>', '', frag)
        except Exception:
            pass

        # Resolve image URLs
        imgs = []
        try:
            for mm in re.finditer(r'(?is)<img[^>]+(?:src|data-src|data-original|data-lazy-src)=\s*["\']([^"\']+)["\']', frag):
                src = mm.group(1)
                if not src:
                    continue
                try:
                    imgs.append(_sanitize_url_for_fetch(urljoin(page_url, src)))
                except Exception:
                    continue
        except Exception:
            imgs = []

        try:
            frag = _normalize_images_for_preview(frag, base_url=page_url, preserve_local=False)
        except Exception:
            pass

        # De-dup preserving order, dropping empty entries
        uniq = list(dict.fromkeys(u for u in imgs if u))
        return frag, uniq
    except Exception:
        return '', []


def _fetch_article_content(url, timeout=None):
    """Conservative best-effort article HTML extractor.

    Returns (html_fragment, [image_urls]). Uses current fetch_url helper and
    simple heuristics (article tag, common content divs, or first <p> tags).
    """
    try:
        from calibre_plugins.rss_reader.rss import fetch_url
        from urllib.parse import urljoin
        tout = int(timeout or plugin_prefs.get('timeout_seconds', 12) or 12)
        raw, final = fetch_url(url, timeout_seconds=tout)
        if not raw:
            return '', []
        # Try decoding as UTF-8 then fall back
        try:
            text = raw.decode('utf-8')
        except Exception:
            try:
                text = raw.decode('latin-1')
            except Exception:
                text = raw.decode('utf-8', errors='replace')

        # Try JSON-LD first (often contains full `articleBody` even for scripted sites)
        try:
            jhtml, jimgs = _extract_article_from_jsonld(text)
            if jhtml:
                try:
                    frag = _normalize_images_for_preview(jhtml, base_url=(final or url), preserve_local=False)
                except Exception:
                    frag = jhtml
                uniq = []
                seen = set()
                for u in (jimgs or []):
                    try:
                        u = _sanitize_url_for_fetch(u)
                    except Exception:
                        pass
                    if not u or u in seen:
                        continue
                    seen.add(u)
                    uniq.append(u)
                return frag, uniq
        except Exception:
            pass

        # Avoid heavy DOM parsing in the preview path.
        # Prefer explicit <article>, otherwise just take the first N <p> blocks.
        m = re.search(r'(?is)<article[^>]*>(.*?)</article>', text)
        if m:
            body = m.group(1)
        else:
            ps = re.findall(r'(?is)(<p[^>]*>.*?</p>)', text)
            if ps:
                body = '\n'.join(ps[:12])
            else:
                m = re.search(r'(?is)<body[^>]*>(.*?)</body>', text)
                body = m.group(1) if m else text[:6000]

        # Remove scripts/styles
        body = re.sub(r'(?is)<script.*?>.*?</script>', '', body)
        body = re.sub(r'(?is)<style.*?>.*?</style>', '', body)

        # Resolve image URLs
        imgs = []
        try:
            for mm in re.finditer(r'(?is)<img[^>]+(?:src|data-src|data-original|data-lazy-src)=\s*["\']([^"\']+)["\']', body):
                src = mm.group(1)
                if not src:
                    continue
                try:
                    full = urljoin(final or url, src)
                    full = _sanitize_url_for_fetch(full)
                    imgs.append(full)
                except Exception:
                    continue
        except Exception:
            imgs = []

        # Diagnostic: if no images were found, log a short snippet of the fetched
        # body and the final URL to help diagnose feed-specific extraction issues.
        try:
            from calibre_plugins.rss_reader.config import plugin_prefs
            debug_fetch = bool(plugin_prefs.get('debug_fetch_images', False))
        except Exception:
            debug_fetch = False
        if debug_fetch and not imgs:
            try:
                import logging as _logging
                snippet = (body or '')[:600]
                _logging.warning('fetch_article: no imgs for %r final_url=%r snippet=%r', url, final, snippet)
            except Exception:
                pass

        # Normalize image attributes (resolve relative URLs etc.)
        try:
            frag = _normalize_images_for_preview(body, base_url=(final or url), preserve_local=False)
        except Exception:
            frag = body

        # Deduplicate images preserving order
        seen = set()
        uniq = []
        for u in imgs:
            if not u or u in seen:
                continue
            seen.add(u)
            uniq.append(u)

        return frag, uniq
    except Exception:
        return '', []


def _fetch_article_content_calibre_readability(url, timeout=None):
    """Fetch article HTML and extract main content using calibre's readability.

    Returns (html_fragment, [image_urls]). Falls back to ('', []) on errors.
    """
    try:
        from calibre_plugins.rss_reader.rss import fetch_url
        from urllib.parse import urljoin
        tout = int(timeout or plugin_prefs.get('timeout_seconds', 12) or 12)
        raw, final = fetch_url(url, timeout_seconds=tout)
        if not raw:
            return '', []

        try:
            text = raw.decode('utf-8')
        except Exception:
            try:
                text = raw.decode('latin-1')
            except Exception:
                text = raw.decode('utf-8', errors='replace')

        class _NullLog(object):
            def debug(self, *a, **k):
                return None

            def exception(self, *a, **k):
                return None

        try:
            from calibre.ebooks.readability import readability
            doc = readability.Document(text, _NullLog(), url=(final or url))
            article_html = doc.summary()
        except Exception:
            return '', []

        # Resolve image URLs
        imgs = []
        try:
            for mm in re.finditer(r'(?is)<img[^>]+(?:src|data-src|data-original|data-lazy-src)=\s*["\']([^"\']+)["\']', article_html):
                src = mm.group(1)
                if not src:
                    continue
                try:
                    full = urljoin(final or url, src)
                    full = _sanitize_url_for_fetch(full)
                    imgs.append(full)
                except Exception:
                    continue
        except Exception:
            imgs = []

        try:
            frag = _normalize_images_for_preview(article_html, base_url=(final or url), preserve_local=False)
        except Exception:
            frag = article_html

        seen = set()
        uniq = []
        for u in imgs:
            if not u or u in seen:
                continue
            seen.add(u)
            uniq.append(u)
        return frag, uniq
    except Exception:
        return '', []


def _sanitize_article_html_for_embed(article_html):
    """Make fetched article HTML safe to embed inside our preview HTML.

    - If the fetched content is a full HTML document, extract the <body>.
    - If the content is plain text or has no block structure, preserve newlines.
    - Ensure block-level elements have proper spacing/margins for multi-paragraph display.
    """
    import html as _html

    try:
        s = article_html
        if s is None:
            return ''
        s = str(s)
    except Exception:
        return ''

    # Extract <body> if present (avoid nesting full HTML documents inside preview)
    try:
        m = re.search(r'(?is)<body[^>]*>(.*?)</body>', s)
        if m:
            s = m.group(1)
        else:
            # If it looks like a full doc, strip outer html/head tags conservatively
            if re.search(r'(?is)<html\b', s):
                s = re.sub(r'(?is)^.*?<html\b[^>]*>', '', s)
                s = re.sub(r'(?is)</html>.*$', '', s)
            s = re.sub(r'(?is)<head\b[^>]*>.*?</head>', '', s)
    except Exception:
        pass

    # If it's plain text (no tags), escape and preserve newlines.
    try:
        if '<' not in s and '>' not in s:
            s = _html.escape(s)
            return '<div style="white-space:pre-wrap;line-height:1.5">%s</div>' % s
    except Exception:
        pass

    # Ensure block elements have proper line-height and margins for multi-paragraph rendering.
    # This prevents QTextBrowser from collapsing multiple paragraphs into a single line.
    try:
        # If there are paragraph/div/list elements, ensure they have spacing.
        if re.search(r'(?is)<\s*(p|div|li|h\d|blockquote)\b', s):
            # Wrap the entire content in a div with explicit line-height and margins
            # to force proper multi-paragraph rendering.
            return '<div style="line-height:1.6;margin:0;padding:0">%s</div>' % s
    except Exception:
        pass

    # If it has no obvious paragraph/block structure, preserve newlines.
    try:
        has_blocks = bool(re.search(r'(?is)<\s*(p|div|br|ul|ol|li|h\d|blockquote)\b', s))
        if not has_blocks and ('\n' in s or '\r' in s):
            # Preserve newlines without breaking existing inline HTML.
            return '<div style="white-space:pre-wrap;line-height:1.5">%s</div>' % s
    except Exception:
        pass

    return s
