# Pattern to match HLS / m3u8 URLs (direct stream links)
HLS_URL_RE = re.compile(
    r'https?://[^\s]+\.m3u8(?:[?#][^\s]*)?',
    re.IGNORECASE,
)


def get_video_path(filename: str, platform: str = "twitter") -> str:
    """Return the path of *filename* inside the directory used for *platform*.

    ``"youtube"``, ``"pornhub"`` and ``"hls"`` each map to their own
    directory; every other value (including the default ``"twitter"``)
    maps to the X/Twitter directory.
    """
    if platform == "youtube":
        return os.path.join(YOUTUBE_VIDEOS_PATH, filename)
    if platform == "pornhub":
        return os.path.join(PH_VIDEOS_PATH, filename)
    if platform == "hls":
        return os.path.join(HLS_VIDEOS_PATH, filename)
    return os.path.join(X_VIDEOS_PATH, filename)


def _is_hls_url(url: str) -> bool:
    """Return True when *url* as a whole is an http(s) link to an ``.m3u8`` playlist.

    The entire string has to match (optionally followed by a query string or
    fragment). A prefix-only match would also accept things like
    ``https://host/a.m3u8x`` or ``https://host/a.m3u8.html``.
    """
    # Trailing whitespace (e.g. a pasted newline) is tolerated; anything else
    # after the playlist path must be a ``?query`` or ``#fragment``.
    return bool(HLS_URL_RE.fullmatch(url.rstrip()))


def _hls_quality_rank(quality: str) -> int:
    """Return the number encoded in an ``"<N>p"`` label, or 0 when there is none."""
    if not quality.endswith("p"):
        return 0
    try:
        return int(quality.replace("p", ""))
    except ValueError:
        # Labels such as "backup" end in "p" without being a resolution.
        return 0


def _build_hls_formats(raw_formats: Optional[list]) -> list:
    """Turn yt-dlp format dicts into the format list exposed to callers.

    Entries whose ``vcodec`` is ``"none"`` are dropped, only the first entry
    per quality label is kept, the result is ordered from highest to lowest
    resolution (labels without a resolution go last, in their original
    order) and a synthetic ``"best"`` entry is placed in front.

    Args:
        raw_formats: The ``formats`` value of a yt-dlp info dict; ``None``
            and an empty list are both accepted.

    Returns:
        A list of dicts with the keys ``format_id``, ``quality``, ``ext``,
        ``filesize`` and ``note``.
    """
    formats = []
    seen = set()
    for f in raw_formats or []:
        if f.get("vcodec", "none") == "none":
            continue
        height = f.get("height")
        # The key may be present with a None value, so ``.get(key, default)``
        # alone is not enough to guarantee a string label.
        note = f.get("format_note") or "HLS"
        quality = f"{height}p" if height else note
        if quality in seen:
            continue
        seen.add(quality)
        formats.append({
            "format_id": f.get("format_id", ""),
            "quality": quality,
            "ext": "mp4",
            "filesize": f.get("filesize") or f.get("filesize_approx") or 0,
            "note": note,
        })

    formats.sort(key=lambda item: _hls_quality_rank(item["quality"]), reverse=True)
    formats.insert(0, {
        "format_id": "best",
        "quality": "best",
        "ext": "mp4",
        "filesize": 0,
        "note": "Best available quality",
    })
    return formats


def _parse_hls_video(url: str) -> dict:
    """Parse HLS/m3u8 stream info using yt-dlp.

    Args:
        url: Direct link to the ``.m3u8`` playlist.

    Returns:
        A dict with the keys ``title``, ``thumbnail``, ``duration``,
        ``formats``, ``url`` and ``platform`` (always ``"hls"``). When
        yt-dlp cannot extract anything, a minimal dict with a single
        ``"best"`` format is returned so a direct download can still be
        attempted.
    """
    # NOTE(review): only "generic" is allowed here while _download_hls_video
    # also lists "m3u8" — confirm whether the difference is intentional.
    ydl_opts = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "allowed_extractors": ["generic"],
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)
    except Exception as exc:
        # Best effort: keep the fallback below, but do not lose the reason.
        logger.warning("yt-dlp could not parse HLS stream %s: %s", url, exc)
        info = None

    if info is None:
        # If yt-dlp can't parse, return minimal info to allow direct download
        return {
            "title": "HLS Stream",
            "thumbnail": "",
            "duration": 0,
            "formats": [{
                "format_id": "best",
                "quality": "best",
                "ext": "mp4",
                "filesize": 0,
                "note": "HLS stream (auto-merge)",
            }],
            "url": url,
            "platform": "hls",
        }

    return {
        "title": info.get("title") or "HLS Stream",
        "thumbnail": info.get("thumbnail") or "",
        "duration": info.get("duration", 0) or 0,
        "formats": _build_hls_formats(info.get("formats")),
        "url": url,
        "platform": "hls",
    }


def _download_hls_video(url: str, format_id: str = "best", progress_callback=None,
                        task_id: Optional[str] = None) -> dict:
    """Download HLS/m3u8 stream using yt-dlp (handles segment merge automatically).

    Args:
        url: Direct link to the ``.m3u8`` playlist.
        format_id: ``"best"`` or a format id taken from ``_parse_hls_video``.
        progress_callback: Accepted for signature parity with the caller;
            not used by this function.
        task_id: When truthy, a progress hook built by ``_make_hook(task_id)``
            is attached to the download.

    Returns:
        A dict with the keys ``title``, ``thumbnail``, ``duration``,
        ``filename``, ``file_path``, ``file_size`` and ``platform``.
        ``file_size`` is 0 when the output file cannot be found on disk.
    """
    uid = str(uuid.uuid4())[:8]
    output_template = os.path.join(HLS_VIDEOS_PATH, f"hls_{uid}.%(ext)s")

    if format_id == "best":
        format_spec = "bestvideo[ext=mp4]+bestaudio[ext=m4a]/bestvideo+bestaudio/best[ext=mp4]/best"
    else:
        format_spec = f"{format_id}+bestaudio/{format_id}/best"

    hooks = [_make_hook(task_id)] if task_id else []

    ydl_opts = {
        "format": format_spec,
        "outtmpl": output_template,
        "merge_output_format": "mp4",
        "quiet": True,
        "no_warnings": True,
        "progress_hooks": hooks,
        "allowed_extractors": ["generic", "m3u8"],
        # HLS-specific: concurrent fragment download for speed
        "concurrent_fragment_downloads": 5,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)

    if not os.path.exists(filename):
        # The name prepared from the template may carry another extension
        # than the merged output, which is requested as mp4 above.
        filename = os.path.splitext(filename)[0] + ".mp4"

    if os.path.exists(filename):
        file_size = os.path.getsize(filename)
    else:
        # Same return shape as before, but the missing file is now visible
        # in the logs instead of only showing up as a zero size.
        logger.warning("HLS download finished but output file was not found: %s", filename)
        file_size = 0

    return {
        "title": info.get("title") or "HLS Stream",
        "thumbnail": info.get("thumbnail") or "",
        "duration": info.get("duration", 0) or 0,
        "filename": os.path.basename(filename),
        "file_path": filename,
        "file_size": file_size,
        "platform": "hls",
    }