feat: support HLS/m3u8 stream download

2026-03-17 20:11:34 +08:00
parent f03fae2e2e
commit b76f0aa1f6
1 changed files with 126 additions and 0 deletions
--- a/backend/app/services/downloader.py
+++ b/backend/app/services/downloader.py
@@ -77,11 +77,13 @@ VIDEO_BASE_PATH = os.getenv("VIDEO_BASE_PATH", "/home/xdl/xdl_videos")
 X_VIDEOS_PATH = os.path.join(VIDEO_BASE_PATH, "x_videos")
 YOUTUBE_VIDEOS_PATH = os.path.join(VIDEO_BASE_PATH, "youtube_videos")
 PH_VIDEOS_PATH = os.path.join(VIDEO_BASE_PATH, "ph_videos")
+HLS_VIDEOS_PATH = os.path.join(VIDEO_BASE_PATH, "hls_videos")

 # Ensure directories exist
 os.makedirs(X_VIDEOS_PATH, exist_ok=True)
 os.makedirs(YOUTUBE_VIDEOS_PATH, exist_ok=True)
 os.makedirs(PH_VIDEOS_PATH, exist_ok=True)
+os.makedirs(HLS_VIDEOS_PATH, exist_ok=True)

 # Pattern to match YouTube URLs
 YOUTUBE_URL_RE = re.compile(
@@ -99,12 +101,20 @@ PORNHUB_URL_RE = re.compile(
    r'|https?://phub\.to/[\w-]+'
 )

+# Pattern to match HLS / m3u8 URLs (direct stream links): an http(s) URL that
+# contains ".m3u8", optionally followed by a "?query" or "#fragment".
+# Case-insensitive; the pattern carries no end anchor.
+HLS_URL_RE = re.compile(
+    r'https?://[^\s]+\.m3u8(?:[?#][^\s]*)?',
+    re.IGNORECASE,
+)
+

 def get_video_path(filename: str, platform: str = "twitter") -> str:
+    """Return the path of `filename` inside the storage directory of `platform`.
+
+    "youtube", "pornhub" and "hls" each map to their own directory; any other
+    value (including the default "twitter") falls back to X_VIDEOS_PATH.
+    """
    if platform == "youtube":
        return os.path.join(YOUTUBE_VIDEOS_PATH, filename)
    if platform == "pornhub":
        return os.path.join(PH_VIDEOS_PATH, filename)
+    if platform == "hls":
+        return os.path.join(HLS_VIDEOS_PATH, filename)
    return os.path.join(X_VIDEOS_PATH, filename)


@@ -124,6 +134,8 @@ def detect_platform(url: str) -> str:
        return "youtube"
    if _is_pornhub_url(url):
        return "pornhub"
+    if _is_hls_url(url):
+        return "hls"
    return "unknown"


@@ -131,6 +143,10 @@ def _is_twitter_url(url: str) -> bool:
    return bool(TWITTER_URL_RE.match(url))


+def _is_hls_url(url: str) -> bool:
+    """Tell whether `url` begins with a direct HLS (.m3u8) stream link."""
+    return HLS_URL_RE.match(url) is not None
+
+
 def _extract_tweet_id(url: str) -> Optional[str]:
+    """Return capture group 1 of TWITTER_URL_RE matched at the start of `url`.
+
+    The group is presumably the tweet ID (the pattern itself is defined
+    elsewhere in this module). Returns None when `url` does not match.
+    """
    m = TWITTER_URL_RE.match(url)
    return m.group(1) if m else None
@@ -496,6 +512,106 @@ def _download_pornhub_video(url: str, format_id: str = "best", progress_callback
    }


+def _parse_hls_video(url: str) -> dict:
+    """Parse HLS/m3u8 stream info using yt-dlp, without downloading.
+
+    Args:
+        url: Direct link to the HLS playlist.
+
+    Returns:
+        A dict with "title", "thumbnail", "duration", "formats", "url" and
+        "platform" (always "hls"). "formats" starts with a synthetic "best"
+        entry followed by one entry per distinct quality label, highest
+        resolution first. When yt-dlp cannot probe the URL, "best" is the
+        only entry so that a direct download can still be attempted.
+    """
+    ydl_opts = {
+        "quiet": True,
+        "no_warnings": True,
+        "skip_download": True,
+        "allowed_extractors": ["generic"],
+    }
+    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            # Guard against a None result so the .get() calls below cannot fail.
+            info = ydl.extract_info(url, download=False) or {}
+    except Exception as exc:
+        # Deliberate best effort: probing is optional, so record the failure
+        # and return minimal info that still allows a direct download.
+        logger.warning(f"HLS probe failed, falling back to minimal info: {exc}")
+        return {
+            "title": "HLS Stream",
+            "thumbnail": "",
+            "duration": 0,
+            "formats": [{"format_id": "best", "quality": "best", "ext": "mp4", "filesize": 0, "note": "HLS stream (auto-merge)"}],
+            "url": url,
+            "platform": "hls",
+        }
+
+    formats = []
+    seen = set()
+    for f in info.get("formats") or []:
+        # Drop entries whose vcodec is "none" or missing.
+        if f.get("vcodec", "none") == "none":
+            continue
+        height = f.get("height") or 0
+        # format_note can be present but None; never let None become a label.
+        note = f.get("format_note") or "HLS"
+        quality = f"{height}p" if height else note
+        # Keep only the first format seen for each quality label.
+        if quality in seen:
+            continue
+        seen.add(quality)
+        formats.append({
+            "format_id": f.get("format_id", ""),
+            "quality": quality,
+            "ext": "mp4",
+            "filesize": f.get("filesize") or f.get("filesize_approx") or 0,
+            "note": note,
+        })
+
+    def _resolution(entry: dict) -> int:
+        # Only labels of the exact form "<digits>p" carry a resolution. Any
+        # other label (even one that merely ends in "p", e.g. "backup")
+        # ranks as 0 instead of raising ValueError from int().
+        m = re.fullmatch(r"(\d+)p", entry["quality"])
+        return int(m.group(1)) if m else 0
+
+    formats.sort(key=_resolution, reverse=True)
+    formats.insert(0, {"format_id": "best", "quality": "best", "ext": "mp4", "filesize": 0, "note": "Best available quality"})
+
+    return {
+        "title": info.get("title") or "HLS Stream",
+        "thumbnail": info.get("thumbnail") or "",
+        "duration": info.get("duration", 0) or 0,
+        "formats": formats,
+        "url": url,
+        "platform": "hls",
+    }
+
+
+def _download_hls_video(url: str, format_id: str = "best", progress_callback=None, task_id: Optional[str] = None) -> dict:
+    """Download an HLS/m3u8 stream using yt-dlp and merge it into an MP4.
+
+    Args:
+        url: Direct link to the HLS playlist.
+        format_id: yt-dlp format id to fetch, or "best" for the best
+            video/audio combination available.
+        progress_callback: Accepted for signature parity with the other
+            platform downloaders; not used by this function.
+        task_id: When given, a progress hook built by _make_hook(task_id) is
+            attached to the download.
+
+    Returns:
+        A dict with "title", "thumbnail", "duration", "filename",
+        "file_path", "file_size" and "platform" (always "hls"). "file_size"
+        is 0 when the output file cannot be found on disk.
+    """
+    uid = str(uuid.uuid4())[:8]
+    output_template = os.path.join(HLS_VIDEOS_PATH, f"hls_{uid}.%(ext)s")
+
+    if format_id == "best":
+        format_spec = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio/best[ext=mp4]/best"
+    else:
+        # Try the requested format with separate audio, then alone, then anything.
+        format_spec = f"{format_id}+bestaudio/{format_id}/best"
+
+    hooks = [_make_hook(task_id)] if task_id else []
+
+    ydl_opts = {
+        "format": format_spec,
+        "outtmpl": output_template,
+        "merge_output_format": "mp4",
+        "quiet": True,
+        "no_warnings": True,
+        "progress_hooks": hooks,
+        # NOTE(review): the parse path allows only "generic"; confirm that
+        # "m3u8" matches an extractor name and is actually needed here.
+        "allowed_extractors": ["generic", "m3u8"],
+        # HLS-specific: concurrent fragment download for speed
+        "concurrent_fragment_downloads": 5,
+    }
+
+    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+        info = ydl.extract_info(url, download=True)
+        # yt-dlp records the final on-disk path (after merge/remux) in
+        # requested_downloads; prefer it over guessing from the template.
+        downloads = info.get("requested_downloads") or []
+        filename = downloads[0].get("filepath") if downloads else None
+        if not filename or not os.path.exists(filename):
+            # Fall back to the templated name; after merging, the extension
+            # is expected to be .mp4 (see merge_output_format above).
+            filename = ydl.prepare_filename(info)
+            if not os.path.exists(filename):
+                filename = os.path.splitext(filename)[0] + ".mp4"
+
+    if os.path.exists(filename):
+        file_size = os.path.getsize(filename)
+    else:
+        # Keep the best-effort contract (size 0) but make the problem visible.
+        logger.warning(f"HLS output file not found after download: {filename}")
+        file_size = 0
+
+    return {
+        "title": info.get("title") or "HLS Stream",
+        "thumbnail": info.get("thumbnail") or "",
+        "duration": info.get("duration", 0) or 0,
+        "filename": os.path.basename(filename),
+        "file_path": filename,
+        "file_size": file_size,
+        "platform": "hls",
+    }
+
+
 def parse_video_url(url: str) -> dict:
    """Extract video info without downloading."""
    # Use syndication API for Twitter/X URLs
@@ -526,6 +642,11 @@ def parse_video_url(url: str) -> dict:
        logger.info(f"Parsing Pornhub video: {url}")
        return _parse_pornhub_video(url)

+    # HLS / m3u8 direct stream URLs
+    if _is_hls_url(url):
+        logger.info(f"Parsing HLS stream: {url}")
+        return _parse_hls_video(url)
+
    # Fallback to generic yt-dlp
    ydl_opts = {
        "quiet": True,
@@ -604,6 +725,11 @@ def download_video(url: str, format_id: str = "best", progress_callback=None, ta
        logger.info(f"Downloading Pornhub video: {url}")
        return _download_pornhub_video(url, format_id, progress_callback, task_id=task_id)

+    # HLS / m3u8 direct stream URLs
+    if _is_hls_url(url):
+        logger.info(f"Downloading HLS stream: {url}")
+        return _download_hls_video(url, format_id, progress_callback, task_id=task_id)
+
    task_id = str(uuid.uuid4())[:8]
    output_template = os.path.join(X_VIDEOS_PATH, f"%(id)s_{task_id}.%(ext)s")