From e4942b188f5c192d5693687698db9b106571332c Mon Sep 17 00:00:00 2001 From: syeopite <syeopite@syeopite.dev> Date: Sun, 23 Jul 2023 05:02:02 -0700 Subject: [PATCH] Integrate transcript captions into captions API --- config/config.example.yml | 13 +++ src/invidious/config.cr | 3 + src/invidious/routes/api/v1/videos.cr | 112 ++++++++++++++------------ src/invidious/videos/caption.cr | 11 ++- src/invidious/videos/transcript.cr | 6 ++ 5 files changed, 91 insertions(+), 54 deletions(-) diff --git a/config/config.example.yml b/config/config.example.yml index 34070fe5..51beab89 100644 --- a/config/config.example.yml +++ b/config/config.example.yml @@ -182,6 +182,19 @@ https_only: false #force_resolve: +## +## Use Innertube's transcripts API instead of timedtext for closed captions +## +## Useful for larger instances as InnerTube is **not ratelimited**. See https://github.com/iv-org/invidious/issues/2567 +## +## Subtitle experience may differ slightly on Invidious. +## +## Accepted values: true, false +## Default: false +## +# use_innertube_for_captions: false + + # ----------------------------- # Logging # ----------------------------- diff --git a/src/invidious/config.cr b/src/invidious/config.cr index e5f1e822..c88a4837 100644 --- a/src/invidious/config.cr +++ b/src/invidious/config.cr @@ -129,6 +129,9 @@ class Config # Use quic transport for youtube api property use_quic : Bool = false + # Use Innertube's transcripts API instead of timedtext for closed captions + property use_innertube_for_captions : Bool = false + # Saved cookies in "name1=value1; name2=value2..." format @[YAML::Field(converter: Preferences::StringToCookies)] property cookies : HTTP::Cookies = HTTP::Cookies.new diff --git a/src/invidious/routes/api/v1/videos.cr b/src/invidious/routes/api/v1/videos.cr index af4fc806..000e64b9 100644 --- a/src/invidious/routes/api/v1/videos.cr +++ b/src/invidious/routes/api/v1/videos.cr @@ -87,70 +87,78 @@ module Invidious::Routes::API::V1::Videos caption = caption[0] end - url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target + if CONFIG.use_innertube_for_captions + params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated) + initial_data = YoutubeAPI.transcript(params.to_s) - # Auto-generated captions often have cues that aren't aligned properly with the video, - # as well as some other markup that makes it cumbersome, so we try to fix that here - if caption.name.includes? "auto-generated" - caption_xml = YT_POOL.client &.get(url).body + webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code) + else + # Timedtext API handling + url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target - if caption_xml.starts_with?("<?xml") - webvtt = caption.timedtext_to_vtt(caption_xml, tlang) - else - caption_xml = XML.parse(caption_xml) + # Auto-generated captions often have cues that aren't aligned properly with the video, + # as well as some other markup that makes it cumbersome, so we try to fix that here + if caption.name.includes? "auto-generated" + caption_xml = YT_POOL.client &.get(url).body - webvtt = String.build do |str| - str << <<-END_VTT - WEBVTT - Kind: captions - Language: #{tlang || caption.language_code} + if caption_xml.starts_with?("<?xml") + webvtt = caption.timedtext_to_vtt(caption_xml, tlang) + else + caption_xml = XML.parse(caption_xml) + + webvtt = String.build do |str| + str << <<-END_VTT + WEBVTT + Kind: captions + Language: #{tlang || caption.language_code} - END_VTT + END_VTT - caption_nodes = caption_xml.xpath_nodes("//transcript/text") - caption_nodes.each_with_index do |node, i| - start_time = node["start"].to_f.seconds - duration = node["dur"]?.try &.to_f.seconds - duration ||= start_time + caption_nodes = caption_xml.xpath_nodes("//transcript/text") + caption_nodes.each_with_index do |node, i| + start_time = node["start"].to_f.seconds + duration = node["dur"]?.try &.to_f.seconds + duration ||= start_time - if caption_nodes.size > i + 1 - end_time = caption_nodes[i + 1]["start"].to_f.seconds - else - end_time = start_time + duration + if caption_nodes.size > i + 1 + end_time = caption_nodes[i + 1]["start"].to_f.seconds + else + end_time = start_time + duration + end + + start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}" + end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}" + + text = HTML.unescape(node.content) + text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "") + text = text.gsub(/<\/font>/, "") + if md = text.match(/(?<name>.*) : (?<text>.*)/) + text = "<v #{md["name"]}>#{md["text"]}</v>" + end + + str << <<-END_CUE + #{start_time} --> #{end_time} + #{text} + + + END_CUE end - - start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}" - end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}" - - text = HTML.unescape(node.content) - text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "") - text = text.gsub(/<\/font>/, "") - if md = text.match(/(?<name>.*) : (?<text>.*)/) - text = "<v #{md["name"]}>#{md["text"]}</v>" - end - - str << <<-END_CUE - #{start_time} --> #{end_time} - #{text} - - - END_CUE end end - end - else - # Some captions have "align:[start/end]" and "position:[num]%" - # attributes. Those are causing issues with VideoJS, which is unable - # to properly align the captions on the video, so we remove them. - # - # See: https://github.com/iv-org/invidious/issues/2391 - webvtt = YT_POOL.client &.get("#{url}&format=vtt").body - if webvtt.starts_with?("<?xml") - webvtt = caption.timedtext_to_vtt(webvtt) else + # Some captions have "align:[start/end]" and "position:[num]%" + # attributes. Those are causing issues with VideoJS, which is unable + # to properly align the captions on the video, so we remove them. + # + # See: https://github.com/iv-org/invidious/issues/2391 webvtt = YT_POOL.client &.get("#{url}&format=vtt").body - .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1") + if webvtt.starts_with?("<?xml") + webvtt = caption.timedtext_to_vtt(webvtt) + else + webvtt = YT_POOL.client &.get("#{url}&format=vtt").body + .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1") + end end end diff --git a/src/invidious/videos/caption.cr b/src/invidious/videos/caption.cr index c85b46c3..1e2abde9 100644 --- a/src/invidious/videos/caption.cr +++ b/src/invidious/videos/caption.cr @@ -6,7 +6,9 @@ module Invidious::Videos property language_code : String property base_url : String - def initialize(@name, @language_code, @base_url) + property auto_generated : Bool + + def initialize(@name, @language_code, @base_url, @auto_generated) end # Parse the JSON structure from Youtube @@ -25,7 +27,12 @@ module Invidious::Videos language_code = caption["languageCode"].to_s base_url = caption["baseUrl"].to_s - captions_list << CaptionMetadata.new(name, language_code, base_url) + auto_generated = false + if caption["kind"]? && caption["kind"] == "asr" + auto_generated = true + end + + captions_list << CaptionMetadata.new(name, language_code, base_url, auto_generated) end return captions_list diff --git a/src/invidious/videos/transcript.cr b/src/invidious/videos/transcript.cr index ec990883..ba2728cd 100644 --- a/src/invidious/videos/transcript.cr +++ b/src/invidious/videos/transcript.cr @@ -85,7 +85,13 @@ module Invidious::Videos lines = [] of TranscriptLine body.each do |line| + # Transcript section headers. They are not apart of the captions and as such we can safely skip them. + if line.as_h.has_key?("transcriptSectionHeaderRenderer") + next + end + line = line["transcriptSegmentRenderer"] + start_ms = line["startMs"].as_s.to_i.millisecond end_ms = line["endMs"].as_s.to_i.millisecond