Working with YouTube videos using Python

Anh-Thi Dinh
Vietnamese
Mong muốn:
  • Download video youtube chỉ bằng python (video public)
  • Check file size trước khi download (không cần download trước)
  • Check duration của video (mà không cần download trước)
  • Chỉ download audio (với file size nhỏ nhất có thể)
Ngon nhất là package yt-dlp.

AIO metadata

import json
import subprocess


def get_smallest_webm_audio(url):
    """Return metadata for the smallest audio-only webm format of a video.

    Runs ``yt-dlp -J <url>`` to dump the video metadata as JSON (nothing is
    downloaded), then keeps the formats whose ``ext`` is ``"webm"``, whose
    ``vcodec`` is ``"none"`` (audio only) and that report a ``filesize``.

    Args:
        url: Video URL handed to the yt-dlp command line.

    Returns:
        A dict with the keys ``format_id``, ``filesize``, ``duration``,
        ``title`` and ``ext`` describing the matching format with the
        smallest ``filesize``. ``None`` when no format matches, when the
        ``yt-dlp`` executable cannot be found, when yt-dlp exits with an
        error, or when its output is not valid JSON.
    """
    try:
        # -J prints the full metadata as a single JSON document.
        result = subprocess.run(
            ["yt-dlp", "-J", url],
            capture_output=True,
            text=True,
            check=True,
        )
        metadata = json.loads(result.stdout)
    except FileNotFoundError:
        # Raised by subprocess.run when the yt-dlp executable is missing.
        print("Error running yt-dlp: executable not found.")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error running yt-dlp: {e.stderr}")
        return None
    except json.JSONDecodeError:
        print("Error decoding JSON output from yt-dlp.")
        return None

    # "formats" may be absent or null; treat both as "no formats".
    formats = metadata.get("formats") or []

    # Keep only webm audio-only formats whose size is known, so that min()
    # below never has to compare None against an int.
    webm_audio_formats = [
        {
            "format_id": f.get("format_id"),
            "filesize": f.get("filesize"),
            "duration": metadata.get("duration"),
            "title": metadata.get("title"),
            "ext": f.get("ext"),
        }
        for f in formats
        if f.get("ext") == "webm"
        and f.get("vcodec") == "none"
        and f.get("filesize") is not None
    ]
    if not webm_audio_formats:
        return None
    return min(webm_audio_formats, key=lambda fmt: fmt["filesize"])
# Example: look up the smallest webm audio format of a public video.
url = "https://www.youtube.com/watch?v=ry9SYnV3svc"  # Replace with your video URL
smallest_webm_audio = get_smallest_webm_audio(url)
if not smallest_webm_audio:
    # The lookup returned None (or an empty result).
    print("No webm audio format with filesize found.")
else:
    print("Smallest webm audio format:")
    print(smallest_webm_audio)

Get video size

import yt_dlp


def get_youtube_video_size_mb(video_url):
    """Return the size of the first sized audio-only format, in MB.

    Extracts the metadata with yt-dlp without downloading anything, then
    walks the ``formats`` list in order and uses the first entry whose
    ``vcodec`` is ``"none"`` (audio only) and whose ``filesize`` is known.
    Despite the function name, video formats are never considered.

    Args:
        video_url: URL passed to ``YoutubeDL.extract_info``.

    Returns:
        ``filesize / (1024 * 1024)`` for that format as a float, or ``None``
        when no such format exists or when anything goes wrong during
        extraction.
    """
    try:
        # NOTE(review): "listformats" may make yt-dlp print the formats table
        # to stdout even with "quiet" set -- confirm this is wanted.
        ydl_opts = {"listformats": True, "quiet": True}

        # Only the module is imported, so the class must be referenced through
        # it; a bare `YoutubeDL` raises NameError, which the handler below
        # would silently turn into None.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(video_url, download=False)

        audio_sizes = (
            f["filesize"]
            for f in result.get("formats", [])
            if f.get("vcodec") == "none"  # Indicates "audio only"
            and f.get("filesize") is not None
        )
        first_size = next(audio_sizes, None)
        if first_size is None:
            return None
        return first_size / (1024 * 1024)  # Convert to MB
    except Exception:
        # Deliberately best-effort: any extraction failure yields None.
        return None

Get video duration

import yt_dlp


def get_youtube_video_duration_s(video_url):
    """Return the duration of a YouTube video in seconds.

    Extracts the metadata with yt-dlp without downloading anything and reads
    its ``duration`` field.

    Args:
        video_url: URL passed to ``YoutubeDL.extract_info``.

    Returns:
        The ``duration`` value from the extracted metadata, or ``None`` when
        the field is missing or when anything goes wrong during extraction.
    """
    try:
        ydl_opts = {"quiet": True}

        # Only the module is imported, so the class must be referenced through
        # it; a bare `YoutubeDL` raises NameError, which the handler below
        # would silently turn into None.
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            result = ydl.extract_info(video_url, download=False)
        return result.get("duration")
    except Exception:
        # Deliberately best-effort: any extraction failure yields None.
        return None

Download

Không thể download bằng python package yt-dlp được, toàn ra lỗi 403
An error occurred: ERROR: unable to download video data: HTTP Error 403: Forbidden
Cách hay nhất để download là cũng dùng yt-dlp nhưng là dùng CLI của nó!
import subprocess
import json


def download_yt(url, output_dir="."):
    """Download the smallest audio-only format of a video via the yt-dlp CLI.

    First runs ``yt-dlp -J <url>`` to read the available formats, picks the
    audio-only format (``vcodec == "none"``) with the smallest known
    ``filesize``, then runs ``yt-dlp -f <format_id>`` to download it into
    ``output_dir`` as ``<title>.<ext>``.

    Args:
        url: Video URL handed to the yt-dlp command line.
        output_dir: Directory prefix used in the yt-dlp output template.

    Returns:
        None. The result of the download command itself is not checked.

    Raises:
        ValueError: If the metadata call fails, its output is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or the video has
            no audio-only format.
        KeyError: If the metadata has no ``"formats"`` entry.
    """
    # Get video formats info
    cmd = ["yt-dlp", "-J", url]
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        # Surface yt-dlp's own message instead of a JSON error on empty stdout.
        raise ValueError(f"yt-dlp could not read metadata: {result.stderr.strip()}")
    formats = json.loads(result.stdout)["formats"]

    # Find audio format with smallest filesize
    audio_formats = [f for f in formats if f.get("vcodec") == "none"]
    if not audio_formats:
        raise ValueError("No audio-only format available for this URL.")
    # A format may carry "filesize": None. dict.get()'s default only covers a
    # missing key, so map None to infinity explicitly; otherwise min() would
    # compare None with an int and raise TypeError.
    best_format = min(
        audio_formats,
        key=lambda f: f.get("filesize") or float("inf"),
    )
    format_id = best_format["format_id"]
    print("👉👉👉 format_id: ", format_id)

    # Download with selected format
    cmd = [
        "yt-dlp",
        "-f", format_id,
        "-o", f"{output_dir}/%(title)s.%(ext)s",
        url,
    ]
    subprocess.run(cmd)
# Usage: download the smallest audio track into the given directory.
download_yt(
    "https://www.youtube.com/watch?v=ry9SYnV3svc",
    output_dir="../fake_storage_account/audios/audioDirectory",
)