YouTubeのURLから、自動生成された字幕ファイルを取得（プレーンテキストにも変換）するPythonスクリプト

自動字幕（ja-auto）はYouTubeTranscriptApiでは取得できないらしく、yt-dlp を使った。

pip install yt-dlp

pip install yt-dlp

subtitles.ja.srt　←　字幕ファイル
subtitles.txt　←　プレーンテキストファイル
の2種類が生成される。

import subprocess
import os
import re

def clean_html_tags(text):
    """Return *text* with every HTML-style tag (notably <font> tags) removed."""
    # Anything between "<" and the next ">" counts as a tag and is dropped.
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

def _srt_to_text_lines(lines):
    """Return the caption text of SRT *lines* with consecutive repeats removed."""
    text_lines = []
    previous_text = None  # last caption line seen, used to drop repeats

    for line in lines:
        stripped_line = clean_html_tags(line.strip())
        # Skip blank separators, cue numbers and timestamp lines. They must
        # not update previous_text, otherwise the blank line between two cues
        # would hide a caption line repeated across those cues.
        if not stripped_line or re.match(r"^\d+$", stripped_line) or "-->" in stripped_line:
            continue
        if stripped_line != previous_text:
            text_lines.append(stripped_line)
        previous_text = stripped_line

    return text_lines


def get_youtube_subtitles(video_url, output_srt="subtitles.srt", output_txt="subtitles.txt"):
    """Fetch a video's auto-generated Japanese subtitles and convert them to plain text.

    Runs ``yt-dlp`` to fetch the auto-generated ``ja`` subtitle track as SRT
    without downloading the video, then strips cue numbers, timestamps and
    HTML tags, drops consecutively repeated lines, and writes the remaining
    text to *output_txt*.

    Args:
        video_url: URL of the YouTube video.
        output_srt: Name from which the yt-dlp output name is derived. The
            extension is dropped and ``.ja.srt`` is expected to be appended
            by yt-dlp, so the default ``"subtitles.srt"`` corresponds to
            ``subtitles.ja.srt``.
        output_txt: Path of the plain-text file to write.

    Returns:
        The caption text joined with newlines, or ``None`` when no subtitle
        file was produced.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    # Honour output_srt instead of a hard-coded name; the default still maps
    # to "subtitles" / "subtitles.ja.srt".
    output_base = os.path.splitext(output_srt)[0] or "subtitles"
    subtitle_file = f"{output_base}.ja.srt"

    # Remove a leftover file from an earlier run so that a video without
    # subtitles is not reported with another video's captions.
    try:
        os.remove(subtitle_file)
    except FileNotFoundError:
        pass

    command = [
        "yt-dlp",
        "--write-auto-sub",  # fetch the auto-generated subtitles
        "--sub-langs", "ja",  # Japanese track
        "--sub-format", "ttml",  # cue spacing; without this the captions come out duplicated
        "--convert-subs", "srt",  # convert to SRT
        "--skip-download",  # do not download the video itself
        "--output", output_base,  # output name without extension
        video_url,
    ]

    subprocess.run(command, check=True)

    if not os.path.exists(subtitle_file):
        print("字幕が取得できませんでした。")
        return None

    # Convert the SRT file to plain text.
    with open(subtitle_file, "r", encoding="utf-8") as f:
        text_lines = _srt_to_text_lines(f)

    plain_text = "\n".join(text_lines)

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write(plain_text)

    print(f"プレーンテキスト字幕を {output_txt} に保存しました。")
    return plain_text

# URL of the YouTube video to process (the ID here looks like a placeholder)
video_url = "https://www.youtube.com/watch?v=xxxxxxxxx"

# Download the auto-generated subtitles and convert them to plain text
subtitles = get_youtube_subtitles(video_url)

# Show the text, or a failure notice when nothing usable came back
print(subtitles if subtitles else "字幕が取得できませんでした。")

import subprocess

import os

import re

def clean_html_tags(text):
    """Return *text* with every HTML-style tag (notably <font> tags) removed."""
    # Anything between "<" and the next ">" counts as a tag and is dropped.
    return re.sub(r'<[^>]+>', '', text)

def _srt_to_text_lines(lines):
    """Return the caption text of SRT *lines* with consecutive repeats removed."""
    text_lines = []
    previous_text = None  # last caption line seen, used to drop repeats

    for line in lines:
        stripped_line = clean_html_tags(line.strip())
        # Skip blank separators, cue numbers and timestamp lines. They must
        # not update previous_text, otherwise the blank line between two cues
        # would hide a caption line repeated across those cues.
        if not stripped_line or re.match(r"^\d+$", stripped_line) or "-->" in stripped_line:
            continue
        if stripped_line != previous_text:
            text_lines.append(stripped_line)
        previous_text = stripped_line

    return text_lines


def get_youtube_subtitles(video_url, output_srt="subtitles.srt", output_txt="subtitles.txt"):
    """Fetch a video's auto-generated Japanese subtitles and convert them to plain text.

    Runs ``yt-dlp`` to fetch the auto-generated ``ja`` subtitle track as SRT
    without downloading the video, then strips cue numbers, timestamps and
    HTML tags, drops consecutively repeated lines, and writes the remaining
    text to *output_txt*.

    Args:
        video_url: URL of the YouTube video.
        output_srt: Name from which the yt-dlp output name is derived. The
            extension is dropped and ``.ja.srt`` is expected to be appended
            by yt-dlp, so the default ``"subtitles.srt"`` corresponds to
            ``subtitles.ja.srt``.
        output_txt: Path of the plain-text file to write.

    Returns:
        The caption text joined with newlines, or ``None`` when no subtitle
        file was produced.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    """
    # Honour output_srt instead of a hard-coded name; the default still maps
    # to "subtitles" / "subtitles.ja.srt".
    output_base = os.path.splitext(output_srt)[0] or "subtitles"
    subtitle_file = f"{output_base}.ja.srt"

    # Remove a leftover file from an earlier run so that a video without
    # subtitles is not reported with another video's captions.
    try:
        os.remove(subtitle_file)
    except FileNotFoundError:
        pass

    command = [
        "yt-dlp",
        "--write-auto-sub",  # fetch the auto-generated subtitles
        "--sub-langs", "ja",  # Japanese track
        "--sub-format", "ttml",  # cue spacing; without this the captions come out duplicated
        "--convert-subs", "srt",  # convert to SRT
        "--skip-download",  # do not download the video itself
        "--output", output_base,  # output name without extension
        video_url,
    ]

    subprocess.run(command, check=True)

    if not os.path.exists(subtitle_file):
        print("字幕が取得できませんでした。")
        return None

    # Convert the SRT file to plain text.
    with open(subtitle_file, "r", encoding="utf-8") as f:
        text_lines = _srt_to_text_lines(f)

    plain_text = "\n".join(text_lines)

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write(plain_text)

    print(f"プレーンテキスト字幕を {output_txt} に保存しました。")
    return plain_text

# URL of the YouTube video to process (the ID here looks like a placeholder)
video_url = "https://www.youtube.com/watch?v=xxxxxxxxx"

# Download the auto-generated subtitles and convert them to plain text
subtitles = get_youtube_subtitles(video_url)

# Show the text, or a failure notice when nothing usable came back
if subtitles:
    print(subtitles)
else:
    print("字幕が取得できませんでした。")

指定したYouTubeチャンネルから動画URLリストを取得する。

import json
import subprocess

# URL of the YouTube channel's video listing
channel_url = "https://www.youtube.com/@Y-IT-Academia/videos"

# Run yt-dlp and capture its JSON output (-J) for the flat playlist
result = subprocess.run(
    ["yt-dlp", "--flat-playlist", "-J", channel_url],
    capture_output=True,
    text=True,
)

# Parse the JSON; only the parse itself is guarded so that other errors are
# not mistaken for a malformed document.
try:
    data = json.loads(result.stdout)
except json.JSONDecodeError:
    print("エラー: JSONの解析に失敗しました。")
    # stdout held no valid JSON; surface whatever yt-dlp wrote to stderr so
    # the underlying failure is visible instead of being discarded.
    if result.stderr:
        print(result.stderr.strip())
else:
    # Skip entries that are null or carry no "url" rather than crashing on
    # them (assumes entries are JSON objects — TODO confirm against yt-dlp).
    video_urls = [
        entry["url"]
        for entry in data.get("entries", [])
        if entry and entry.get("url")
    ]

    # Print the list of URLs
    for url in video_urls:
        print(url)

import json

import subprocess

# URL of the YouTube channel's video listing
channel_url = "https://www.youtube.com/@Y-IT-Academia/videos"

# Run yt-dlp and capture its JSON output (-J) for the flat playlist
result = subprocess.run(
    ["yt-dlp", "--flat-playlist", "-J", channel_url],
    capture_output=True,
    text=True,
)

# Parse the JSON; only the parse itself is guarded so that other errors are
# not mistaken for a malformed document.
try:
    data = json.loads(result.stdout)
except json.JSONDecodeError:
    print("エラー: JSONの解析に失敗しました。")
    # stdout held no valid JSON; surface whatever yt-dlp wrote to stderr so
    # the underlying failure is visible instead of being discarded.
    if result.stderr:
        print(result.stderr.strip())
else:
    # Skip entries that are null or carry no "url" rather than crashing on
    # them (assumes entries are JSON objects — TODO confirm against yt-dlp).
    video_urls = [
        entry["url"]
        for entry in data.get("entries", [])
        if entry and entry.get("url")
    ]

    # Print the list of URLs
    for url in video_urls:
        print(url)

関連記事