YouTubeの自動生成字幕から、ChatGPT APIに4択問題を作ってもらうPythonスクリプト。生成された「正解」が普通に間違っていることもけっこうあるけど、まあ、さらにChatGPTに投げて精度を高めればいいか。

import openai
import json
import os
import re
import subprocess
from urllib.parse import urlparse, parse_qs

# Read the OpenAI API key from the environment.
# e.g. setx OPENAI_API_KEY "your-api-key"
API_KEY = os.getenv("OPENAI_API_KEY")

# Model to use (if `gpt-4o-2024-11-20` raises an error, switch to `gpt-3.5-turbo`)
MODEL_NAME = "gpt-4o-2024-11-20"

# YouTube video URLs to process
VIDEO_URLS = [
    "https://www.youtube.com/watch?v=aaaaaaaaaa",
    "https://www.youtube.com/watch?v=bbbbbbbbbb",
    "https://www.youtube.com/watch?v=cccccccccc"
]

def clean_html_tags(text):
    """Return *text* with every ``<...>`` markup tag (e.g. ``<font>``) removed."""
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

def get_video_id(video_url):
    """Extract the video ID from a YouTube URL.

    Supported forms:
      * ``https://www.youtube.com/watch?v=<id>`` (the ``v`` query parameter
        always takes precedence)
      * ``https://youtu.be/<id>``
      * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``
        and ``/v/<id>`` (also on ``youtube-nocookie.com``)

    Args:
        video_url: The video URL as a string.

    Returns:
        The video ID, or the string ``"unknown"`` when none can be found.
    """
    parsed_url = urlparse(video_url)

    # Standard watch URLs carry the ID in the ``v`` query parameter.
    query_ids = parse_qs(parsed_url.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed_url.hostname or "").lower()
    path_parts = [part for part in parsed_url.path.split("/") if part]

    # Short links: https://youtu.be/<id>
    if host in ("youtu.be", "www.youtu.be") and path_parts:
        return path_parts[0]

    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    youtube_domains = ("youtube.com", "youtube-nocookie.com")
    on_youtube = any(
        host == domain or host.endswith("." + domain)
        for domain in youtube_domains
    )
    if (
        on_youtube
        and len(path_parts) >= 2
        and path_parts[0] in ("embed", "shorts", "live", "v")
    ):
        return path_parts[1]

    return "unknown"

def _srt_to_text_lines(srt_lines):
    """Reduce raw SRT lines to spoken text, dropping cue numbers, timestamps, markup and repeats."""
    text_lines = []
    previous_text = None  # last non-empty text line seen
    for position, raw_line in enumerate(srt_lines):
        line = clean_html_tags(raw_line.strip())
        if not line or "-->" in line:
            # Blank separators and timestamp lines carry no text. They must
            # not reset previous_text, otherwise a line repeated in the next
            # cue would never be recognised as a duplicate.
            continue
        if re.match(r"^\d+$", line):
            # A digit-only line is a cue number only when the timestamp line
            # follows it; otherwise it is spoken text such as "2024".
            has_next = position + 1 < len(srt_lines)
            following = srt_lines[position + 1] if has_next else ""
            if "-->" in following:
                continue
        if line != previous_text:
            text_lines.append(line)
        previous_text = line
    return text_lines


def get_youtube_subtitles(video_url):
    """Download a video's auto-generated Japanese subtitles and save them as plain text.

    Runs ``yt-dlp`` to fetch the subtitles as SRT (the video itself is not
    downloaded), strips cue numbers, timestamps, markup tags and consecutive
    repeated lines, and writes the result to ``subtitles_<video_id>.txt`` in
    the current working directory.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        The path of the plain-text file, or ``None`` when the subtitles could
        not be obtained (``yt-dlp`` missing or failing, or no SRT file
        produced).
    """
    video_id = get_video_id(video_url)
    output_srt = f"subtitles_{video_id}.ja.srt"
    output_txt = f"subtitles_{video_id}.txt"

    # Fetch the subtitles as SRT with yt-dlp.
    command = [
        "yt-dlp",
        "--write-auto-sub",  # fetch the auto-generated subtitles
        "--sub-langs", "ja",  # Japanese subtitles only
        # Request TTML as the source format; per the original author's note,
        # the converted lines come out duplicated without it.
        "--sub-format", "ttml",
        "--convert-subs", "srt",  # convert to SRT
        "--skip-download",  # do not download the video itself
        "--output", f"subtitles_{video_id}.%(ext)s",  # one file name per video ID
        video_url
    ]

    # A failing or missing yt-dlp is reported the same way as "no subtitles"
    # so that one bad video does not abort the whole batch.
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"字幕が取得できませんでした: {video_url} ({e})")
        return None

    # yt-dlp exits successfully even when the video has no such subtitles.
    if not os.path.exists(output_srt):
        print(f"字幕が取得できませんでした: {video_url}")
        return None

    # Convert the SRT file to plain text ("utf-8-sig" tolerates a BOM if present).
    with open(output_srt, "r", encoding="utf-8-sig") as f:
        text_lines = _srt_to_text_lines(f.readlines())

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(text_lines))

    print(f"プレーンテキスト字幕を {output_txt} に保存しました。")
    return output_txt

def extract_json_from_text(text):
    """Extract the JSON payload from a ChatGPT reply.

    The reply may wrap the JSON in a Markdown code fence (with or without a
    ``json`` tag, any letter case, LF or CRLF line endings) or surround it
    with prose. The fenced content is preferred; otherwise the span from the
    first ``{`` to the last ``}`` is returned.

    Args:
        text: The raw reply text.

    Returns:
        The best candidate JSON string, or *text* unchanged when it contains
        neither a fence nor a ``{...}`` span.
    """
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE
    )
    if fenced:
        return fenced.group(1)

    # A reply that already starts with a JSON array is passed through as-is.
    if text.lstrip().startswith("["):
        return text

    # No fence: keep only the outermost {...} span so that prose before or
    # after the object does not break json.loads.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]

    return text

def generate_mcq_from_subtitles(subtitles_file):
    """Generate five 4-choice questions from a subtitle text file via the ChatGPT API.

    Args:
        subtitles_file: Path of a UTF-8 text file holding the subtitle text.

    Returns:
        The decoded JSON reply (expected shape:
        ``{"questions": [{"question", "choices", "answer_index"}, ...]}``),
        or ``None`` when the subtitle text is empty, the API call fails, the
        reply is empty, or the reply is not valid JSON. The shape of the
        decoded data is not validated here.
    """
    with open(subtitles_file, "r", encoding="utf-8") as file:
        subtitles_text = file.read()

    # Without subtitle text the model could only invent questions, so skip
    # the API call entirely.
    if not subtitles_text.strip():
        print(f"字幕テキストが空のため、4択問題を生成できません: {subtitles_file}")
        return None

    prompt = f"""
    以下の字幕テキストを元に、4択問題を5問だけ作成してください。
    - 必ず字幕の内容から、問題を生成する
    - 各問題は1つの正解と3つのそれっぽい不正解を含める
    - 正解の選択肢番号も明記する
    - 出力は JSON 形式で、次のような構造にする：
    ```json
    {{
        "questions": [
            {{
                "question": "問題文",
                "choices": ["選択肢1", "選択肢2", "選択肢3", "選択肢4"],
                "answer_index": 1
            }},
            ...
        ]
    }}
    ```
    
    字幕テキスト:
    {subtitles_text}
    """

    # API boundary: client construction (e.g. missing API key) and the request
    # itself are both reported and turned into a None result.
    try:
        client = openai.OpenAI(api_key=API_KEY)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": prompt}]
        )
        response_text = response.choices[0].message.content
    except Exception as e:
        print(f"エラーが発生しました: {e}")
        return None

    # NOTE(review): the API reference types message.content as optional, so
    # guard against a null/blank reply before parsing — confirm against the SDK.
    if not response_text or not response_text.strip():
        print("エラーが発生しました: モデルから空の応答が返されました")
        return None

    json_text = extract_json_from_text(response_text.strip())
    try:
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"JSONデコードエラー: {e}")
        return None

def _format_question(number, q):
    """Render one question dict as the text block written for it."""
    choices = q["choices"]
    answer_index = q["answer_index"]

    lines = [f"問題 {number}:", f"{q['question']}"]
    lines.extend(f"{j}. {choice}" for j, choice in enumerate(choices, 1))

    # answer_index is treated as 1-based. Values outside 1..len(choices) are
    # reported rather than indexed: 0 would otherwise silently select the
    # LAST choice through Python's negative indexing.
    if isinstance(answer_index, int) and 1 <= answer_index <= len(choices):
        answer = f"{answer_index}. {choices[answer_index - 1]}"
    else:
        answer = f"{answer_index} (選択肢の範囲外)"

    lines.append(f"\n正解: {answer}")
    lines.append("-" * 50)
    return "\n".join(lines) + "\n"


def save_mcq_to_file(mcq_data, video_id):
    """Write the generated 4-choice questions to ``mcq_questions_<video_id>.txt``.

    The whole output is rendered in memory first, so malformed data never
    leaves a half-written file behind.

    Args:
        mcq_data: Decoded reply, expected to be a dict with a ``"questions"``
            list whose items have ``"question"``, ``"choices"`` and a 1-based
            ``"answer_index"``. ``None``, an empty value or any other shape
            is reported as a failure.
        video_id: Video ID used in the output file name.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    output_file = f"mcq_questions_{video_id}.txt"

    questions = mcq_data.get("questions") if isinstance(mcq_data, dict) else None
    if not questions:
        print(f"4択問題の生成に失敗しました: {video_id}")
        return

    try:
        rendered = [_format_question(i, q) for i, q in enumerate(questions, 1)]
    except (KeyError, TypeError) as e:
        # A question is missing a key or has the wrong type.
        print(f"4択問題の生成に失敗しました: {video_id} ({e!r})")
        return

    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(rendered)

    print(f"4択問題を {output_file} に保存しました。")

def main():
    """Process every URL in VIDEO_URLS: fetch subtitles, generate questions, save them."""
    for url in VIDEO_URLS:
        vid = get_video_id(url)
        subtitles_path = get_youtube_subtitles(url)
        if not subtitles_path:
            # No subtitles for this video; move on to the next one.
            continue
        save_mcq_to_file(generate_mcq_from_subtitles(subtitles_path), vid)


if __name__ == "__main__":
    main()

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

import openai

import json

import os

import re

import subprocess

from urllib.parse import urlparse, parse_qs

# Read the OpenAI API key from the environment.

# e.g. setx OPENAI_API_KEY "your-api-key"

API_KEY = os.getenv("OPENAI_API_KEY")

# Model to use (if `gpt-4o-2024-11-20` raises an error, switch to `gpt-3.5-turbo`)

MODEL_NAME = "gpt-4o-2024-11-20"

# YouTube video URLs to process

VIDEO_URLS = [

"https://www.youtube.com/watch?v=aaaaaaaaaa",

"https://www.youtube.com/watch?v=bbbbbbbbbb",

"https://www.youtube.com/watch?v=cccccccccc"

]

def clean_html_tags(text):
    """Return *text* with every ``<...>`` markup tag (e.g. ``<font>``) removed."""
    # Indentation restored: the flattened body was a syntax error.
    return re.sub(r'<[^>]+>', '', text)

def get_video_id(video_url):
    """Extract the video ID from a YouTube URL.

    Supported forms:
      * ``https://www.youtube.com/watch?v=<id>`` (the ``v`` query parameter
        always takes precedence)
      * ``https://youtu.be/<id>``
      * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``
        and ``/v/<id>`` (also on ``youtube-nocookie.com``)

    Args:
        video_url: The video URL as a string.

    Returns:
        The video ID, or the string ``"unknown"`` when none can be found.
    """
    parsed_url = urlparse(video_url)

    # Standard watch URLs carry the ID in the ``v`` query parameter.
    query_ids = parse_qs(parsed_url.query).get("v")
    if query_ids:
        return query_ids[0]

    host = (parsed_url.hostname or "").lower()
    path_parts = [part for part in parsed_url.path.split("/") if part]

    # Short links: https://youtu.be/<id>
    if host in ("youtu.be", "www.youtu.be") and path_parts:
        return path_parts[0]

    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    youtube_domains = ("youtube.com", "youtube-nocookie.com")
    on_youtube = any(
        host == domain or host.endswith("." + domain)
        for domain in youtube_domains
    )
    if (
        on_youtube
        and len(path_parts) >= 2
        and path_parts[0] in ("embed", "shorts", "live", "v")
    ):
        return path_parts[1]

    return "unknown"

def _srt_to_text_lines(srt_lines):
    """Reduce raw SRT lines to spoken text, dropping cue numbers, timestamps, markup and repeats."""
    text_lines = []
    previous_text = None  # last non-empty text line seen
    for position, raw_line in enumerate(srt_lines):
        line = clean_html_tags(raw_line.strip())
        if not line or "-->" in line:
            # Blank separators and timestamp lines carry no text. They must
            # not reset previous_text, otherwise a line repeated in the next
            # cue would never be recognised as a duplicate.
            continue
        if re.match(r"^\d+$", line):
            # A digit-only line is a cue number only when the timestamp line
            # follows it; otherwise it is spoken text such as "2024".
            has_next = position + 1 < len(srt_lines)
            following = srt_lines[position + 1] if has_next else ""
            if "-->" in following:
                continue
        if line != previous_text:
            text_lines.append(line)
        previous_text = line
    return text_lines


def get_youtube_subtitles(video_url):
    """Download a video's auto-generated Japanese subtitles and save them as plain text.

    Runs ``yt-dlp`` to fetch the subtitles as SRT (the video itself is not
    downloaded), strips cue numbers, timestamps, markup tags and consecutive
    repeated lines, and writes the result to ``subtitles_<video_id>.txt`` in
    the current working directory.

    Args:
        video_url: URL of the YouTube video.

    Returns:
        The path of the plain-text file, or ``None`` when the subtitles could
        not be obtained (``yt-dlp`` missing or failing, or no SRT file
        produced).
    """
    video_id = get_video_id(video_url)
    output_srt = f"subtitles_{video_id}.ja.srt"
    output_txt = f"subtitles_{video_id}.txt"

    # Fetch the subtitles as SRT with yt-dlp.
    command = [
        "yt-dlp",
        "--write-auto-sub",  # fetch the auto-generated subtitles
        "--sub-langs", "ja",  # Japanese subtitles only
        # Request TTML as the source format; per the original author's note,
        # the converted lines come out duplicated without it.
        "--sub-format", "ttml",
        "--convert-subs", "srt",  # convert to SRT
        "--skip-download",  # do not download the video itself
        "--output", f"subtitles_{video_id}.%(ext)s",  # one file name per video ID
        video_url
    ]

    # A failing or missing yt-dlp is reported the same way as "no subtitles"
    # so that one bad video does not abort the whole batch.
    try:
        subprocess.run(command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"字幕が取得できませんでした: {video_url} ({e})")
        return None

    # yt-dlp exits successfully even when the video has no such subtitles.
    if not os.path.exists(output_srt):
        print(f"字幕が取得できませんでした: {video_url}")
        return None

    # Convert the SRT file to plain text ("utf-8-sig" tolerates a BOM if present).
    with open(output_srt, "r", encoding="utf-8-sig") as f:
        text_lines = _srt_to_text_lines(f.readlines())

    with open(output_txt, "w", encoding="utf-8") as f:
        f.write("\n".join(text_lines))

    print(f"プレーンテキスト字幕を {output_txt} に保存しました。")
    return output_txt

def extract_json_from_text(text):
    """Extract the JSON payload from a ChatGPT reply.

    The reply may wrap the JSON in a Markdown code fence (with or without a
    ``json`` tag, any letter case, LF or CRLF line endings) or surround it
    with prose. The fenced content is preferred; otherwise the span from the
    first ``{`` to the last ``}`` is returned.

    Args:
        text: The raw reply text.

    Returns:
        The best candidate JSON string, or *text* unchanged when it contains
        neither a fence nor a ``{...}`` span.
    """
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL | re.IGNORECASE
    )
    if fenced:
        return fenced.group(1)

    # A reply that already starts with a JSON array is passed through as-is.
    if text.lstrip().startswith("["):
        return text

    # No fence: keep only the outermost {...} span so that prose before or
    # after the object does not break json.loads.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        return text[start:end + 1]

    return text

def generate_mcq_from_subtitles(subtitles_file):
    """Generate five 4-choice questions from a subtitle text file via the ChatGPT API.

    Args:
        subtitles_file: Path of a UTF-8 text file holding the subtitle text.

    Returns:
        The decoded JSON reply (expected shape:
        ``{"questions": [{"question", "choices", "answer_index"}, ...]}``),
        or ``None`` when the subtitle text is empty, the API call fails, the
        reply is empty, or the reply is not valid JSON. The shape of the
        decoded data is not validated here.
    """
    with open(subtitles_file, "r", encoding="utf-8") as file:
        subtitles_text = file.read()

    # Without subtitle text the model could only invent questions, so skip
    # the API call entirely.
    if not subtitles_text.strip():
        print(f"字幕テキストが空のため、4択問題を生成できません: {subtitles_file}")
        return None

    prompt = f"""
    以下の字幕テキストを元に、4択問題を5問だけ作成してください。
    - 必ず字幕の内容から、問題を生成する
    - 各問題は1つの正解と3つのそれっぽい不正解を含める
    - 正解の選択肢番号も明記する
    - 出力は JSON 形式で、次のような構造にする：
    ```json
    {{
        "questions": [
            {{
                "question": "問題文",
                "choices": ["選択肢1", "選択肢2", "選択肢3", "選択肢4"],
                "answer_index": 1
            }},
            ...
        ]
    }}
    ```
    
    字幕テキスト:
    {subtitles_text}
    """

    # API boundary: client construction (e.g. missing API key) and the request
    # itself are both reported and turned into a None result.
    try:
        client = openai.OpenAI(api_key=API_KEY)
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "system", "content": "You are a helpful assistant."},
                      {"role": "user", "content": prompt}]
        )
        response_text = response.choices[0].message.content
    except Exception as e:
        print(f"エラーが発生しました: {e}")
        return None

    # NOTE(review): the API reference types message.content as optional, so
    # guard against a null/blank reply before parsing — confirm against the SDK.
    if not response_text or not response_text.strip():
        print("エラーが発生しました: モデルから空の応答が返されました")
        return None

    json_text = extract_json_from_text(response_text.strip())
    try:
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"JSONデコードエラー: {e}")
        return None

def _format_question(number, q):
    """Render one question dict as the text block written for it."""
    choices = q["choices"]
    answer_index = q["answer_index"]

    lines = [f"問題 {number}:", f"{q['question']}"]
    lines.extend(f"{j}. {choice}" for j, choice in enumerate(choices, 1))

    # answer_index is treated as 1-based. Values outside 1..len(choices) are
    # reported rather than indexed: 0 would otherwise silently select the
    # LAST choice through Python's negative indexing.
    if isinstance(answer_index, int) and 1 <= answer_index <= len(choices):
        answer = f"{answer_index}. {choices[answer_index - 1]}"
    else:
        answer = f"{answer_index} (選択肢の範囲外)"

    lines.append(f"\n正解: {answer}")
    lines.append("-" * 50)
    return "\n".join(lines) + "\n"


def save_mcq_to_file(mcq_data, video_id):
    """Write the generated 4-choice questions to ``mcq_questions_<video_id>.txt``.

    The whole output is rendered in memory first, so malformed data never
    leaves a half-written file behind.

    Args:
        mcq_data: Decoded reply, expected to be a dict with a ``"questions"``
            list whose items have ``"question"``, ``"choices"`` and a 1-based
            ``"answer_index"``. ``None``, an empty value or any other shape
            is reported as a failure.
        video_id: Video ID used in the output file name.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    output_file = f"mcq_questions_{video_id}.txt"

    questions = mcq_data.get("questions") if isinstance(mcq_data, dict) else None
    if not questions:
        print(f"4択問題の生成に失敗しました: {video_id}")
        return

    try:
        rendered = [_format_question(i, q) for i, q in enumerate(questions, 1)]
    except (KeyError, TypeError) as e:
        # A question is missing a key or has the wrong type.
        print(f"4択問題の生成に失敗しました: {video_id} ({e!r})")
        return

    with open(output_file, "w", encoding="utf-8") as file:
        file.writelines(rendered)

    print(f"4択問題を {output_file} に保存しました。")

def main():
    """Process every URL in VIDEO_URLS: fetch subtitles, generate questions, save them."""
    # Indentation restored: the flattened loop body was a syntax error.
    for video_url in VIDEO_URLS:
        video_id = get_video_id(video_url)
        subtitles_file = get_youtube_subtitles(video_url)
        if subtitles_file:
            mcq_data = generate_mcq_from_subtitles(subtitles_file)
            save_mcq_to_file(mcq_data, video_id)


if __name__ == "__main__":
    main()

関連記事