__license__   = 'GPL v3'
__copyright__ = '2026, Comfy.n'
__docformat__ = 'restructuredtext en'

# Tagging and text processing utilities for RSS Reader
import re
from calibre_plugins.rss_reader.rss import normalize_summary_to_html
from calibre_plugins.rss_reader.config import plugin_prefs

# Lower-case file extensions that mark an enclosure URL as an image / audio file.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.webp', '.avif', '.svg')
_AUDIO_EXTENSIONS = ('.mp3', '.m4a', '.aac', '.ogg', '.opus', '.wav', '.flac', '.m4b')
_DEFAULT_LONG_WORDS = 300
_WORD_RE = re.compile(r"\w+")


def _pref_flag(key):
    """Return boolean preference *key*; True when it is unset or cannot be read."""
    try:
        return bool(plugin_prefs.get(key, True))
    except Exception:
        # Tagging is best-effort: an unreadable flag falls back to "enabled".
        return True


def _long_word_threshold():
    """Return the configured word threshold for the 'long' tag (default 300)."""
    try:
        return int(plugin_prefs.get('auto_tag_long_words', _DEFAULT_LONG_WORDS)
                   or _DEFAULT_LONG_WORDS)
    except Exception:
        # Missing, non-numeric or unreadable value: use the default.
        return _DEFAULT_LONG_WORDS


def _has_enclosure(item_dict, mime_prefix, extensions):
    """Return True if any enclosure matches *mime_prefix* or *extensions*.

    An enclosure matches when its 'type' starts with *mime_prefix*, or when
    its 'url' ends with one of *extensions* (a tuple of lower-case suffixes),
    either as given or after its query string and fragment are removed.
    Entries that are not dicts are skipped; any error while reading the
    enclosures is treated as "no match".
    """
    try:
        for enclosure in (item_dict.get('enclosures') or []):
            if not isinstance(enclosure, dict):
                continue
            mime = str(enclosure.get('type') or '').strip().lower()
            if mime.startswith(mime_prefix):
                return True
            url = str(enclosure.get('url') or '').strip().lower()
            # Enclosure URLs frequently carry tracking/auth parameters
            # ("ep.mp3?token=..."), so also test the part before '?' / '#'.
            # The raw URL is still tested so "get.php?file=ep.mp3" matches.
            path = url.partition('#')[0].partition('?')[0]
            if url.endswith(extensions) or path.endswith(extensions):
                return True
    except Exception:
        # Malformed item/enclosure data must not break tagging.
        return False
    return False


def _count_words(html):
    """Return the number of ``\\w+`` runs in *html* once its tags are stripped."""
    try:
        text = strip_html_to_text(html)
    except Exception:
        # Fall back to counting over the raw markup rather than failing.
        text = str(html or '')
    try:
        return len(_WORD_RE.findall(text or ''))
    except TypeError:
        # The stripper returned something that is not text.
        return 0


def auto_tags_for_item(item_dict):
    """Compute the automatic tags for one feed item.

    Reads the 'summary' and 'enclosures' entries of *item_dict* via ``.get``
    and returns a set containing any of:

    * ``'img'``   - the normalized summary HTML contains ``<img`` or an
      enclosure looks like an image (``image/*`` type or image extension);
    * ``'audio'`` - an enclosure looks like audio (``audio/*`` type or audio
      extension);
    * ``'long'``  - the summary text has at least ``auto_tag_long_words``
      words (default 300; a threshold <= 0 disables this tag).

    Each tag can be switched off with the ``auto_tag_img`` /
    ``auto_tag_audio`` / ``auto_tag_long`` preferences, and all tagging with
    ``auto_tagging_enabled``. The function is best-effort and does not raise
    on malformed input; it returns whatever tags could be determined.
    """
    tags = set()
    try:
        enabled = bool(plugin_prefs.get('auto_tagging_enabled', True))
    except Exception:
        # Unlike the per-tag flags, an unreadable master switch disables tagging.
        return tags
    if not enabled:
        return tags

    do_img = _pref_flag('auto_tag_img')
    do_audio = _pref_flag('auto_tag_audio')
    do_long = _pref_flag('auto_tag_long')
    if not (do_img or do_audio or do_long):
        return tags

    try:
        summary = item_dict.get('summary') or ''
    except Exception:
        summary = ''
    try:
        summary_html = normalize_summary_to_html(summary) if summary else ''
    except Exception:
        # Normalization failed: fall back to the raw summary text.
        summary_html = str(summary or '')
    summary_html = summary_html or ''

    # img: <img> tag in the summary, or an image-like enclosure
    if do_img and ('<img' in summary_html.lower()
                   or _has_enclosure(item_dict, 'image/', _IMAGE_EXTENSIONS)):
        tags.add('img')

    # audio: enclosure type audio/* or common audio file extension
    if do_audio and _has_enclosure(item_dict, 'audio/', _AUDIO_EXTENSIONS):
        tags.add('audio')

    # long: word-count threshold only
    if do_long:
        threshold = _long_word_threshold()
        if threshold > 0 and _count_words(summary_html) >= threshold:
            tags.add('long')

    return tags

# Matches one tag-like span: '<', then the shortest run of non-'<' characters, then '>'.
_TAG_RE = re.compile('<[^<]+?>')
# Recognises the start of an opening/closing block-level or line-break tag.
_BLOCK_TAG_RE = re.compile(
    r'</?\s*(?:address|article|aside|blockquote|br|dd|div|dl|dt|figcaption|'
    r'figure|footer|h[1-6]|header|hr|li|ol|p|pre|section|table|tbody|td|'
    r'tfoot|th|thead|tr|ul)(?=[\s/>])',
    re.IGNORECASE,
)


def _tag_replacement(match):
    """Return ' ' for a block-level/line-break tag and '' for any other tag."""
    return ' ' if _BLOCK_TAG_RE.match(match.group(0)) else ''


def strip_html_to_text(html):
    """Strip tags from *html* and return the remaining text (for word counting).

    Block-level and line-break tags (``<p>``, ``<div>``, ``<br>``, ``<li>``,
    headings, table cells, ...) are replaced with a single space so that words
    separated only by markup, e.g. ``<p>one</p><p>two</p>``, are not fused
    into one. All other tags are removed without a replacement, so inline
    markup inside a word (``<b>H</b>ello``) leaves the word intact.

    Character references such as ``&amp;`` are left untouched. A falsy
    *html* (``None`` or ``''``) yields ``''``.
    """
    return _TAG_RE.sub(_tag_replacement, html or '')

# Add more tagging-related functions as needed, e.g. auto_tags_for_feed, manual_tags_for_feed, etc.
