From bf2a4558f4b75eca2407f9a3047360d60f1755e2 Mon Sep 17 00:00:00 2001
From: Ivan Habunek <ivan@habunek.com>
Date: Thu, 3 Sep 2020 10:33:13 +0200
Subject: [PATCH] Improve VOD joining logic

Instead of writing a file list for ffmpeg's concat demuxer, create a
modified playlist which references the downloaded files and pass it to
ffmpeg as input. Since ffmpeg parses M3U8 playlists natively, playlist
tags such as `EXT-X-BYTERANGE` are now supported.

issue #35
---
 twitchdl/commands.py | 36 +++++++++++++++++++-----------------
 twitchdl/download.py | 15 ++++++++++-----
 2 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/twitchdl/commands.py b/twitchdl/commands.py
index 0cc0d5b..134f3d0 100644
--- a/twitchdl/commands.py
+++ b/twitchdl/commands.py
@@ -103,17 +103,10 @@ def _select_playlist_interactive(playlists):
     return uri
 
 
-def _join_vods(directory, file_paths, target):
-    input_path = "{}files.txt".format(directory)
-
-    with open(input_path, 'w') as f:
-        for path in file_paths:
-            f.write('file {}\n'.format(os.path.basename(path)))
-
+def _join_vods(playlist_path, target):
     command = [
         "ffmpeg",
-        "-f", "concat",
-        "-i", input_path,
+        "-i", playlist_path,
         "-c", "copy",
         target,
         "-stats",
@@ -140,8 +133,9 @@ def _video_target_filename(video, format):
     return name + "." + format
 
 
-def _get_files(playlist, start, end):
-    """Extract files for download from playlist."""
+def _get_vod_paths(playlist, start, end):
+    """Extract unique VOD paths for download from playlist."""
+    files = []
     vod_start = 0
     for segment in playlist.segments:
         vod_end = vod_start + segment.duration
@@ -151,11 +145,13 @@ def _get_files(playlist, start, end):
         start_condition = not start or vod_end > start
         end_condition = not end or vod_start < end
 
-        if start_condition and end_condition:
-            yield segment.uri
+        if start_condition and end_condition and segment.uri not in files:
+            files.append(segment.uri)
 
         vod_start = vod_end
 
+    return files
+
 
 def _crete_temp_dir(base_uri):
     """Create a temp dir to store downloads if it doesn't exist."""
@@ -275,7 +271,7 @@ def _download_video(video_id, args):
 
     base_uri = re.sub("/[^/]+$", "/", playlist_uri)
     target_dir = _crete_temp_dir(base_uri)
-    filenames = list(_get_files(playlist, args.start, args.end))
+    vod_paths = _get_vod_paths(playlist, args.start, args.end)
 
     # Save playlists for debugging purposes
     with open(target_dir + "playlists.m3u8", "w") as f:
@@ -284,12 +280,18 @@ def _download_video(video_id, args):
         f.write(response.text)
 
     print_out("\nDownloading {} VODs using {} workers to {}".format(
-        len(filenames), args.max_workers, target_dir))
-    file_paths = download_files(base_uri, target_dir, filenames, args.max_workers)
+        len(vod_paths), args.max_workers, target_dir))
+    path_map = download_files(base_uri, target_dir, vod_paths, args.max_workers)
+
+    # Make a modified playlist which references downloaded VODs
+    for segment in playlist.segments:
+        segment.uri = path_map[segment.uri]
+    playlist_path = target_dir + "playlist_downloaded.m3u8"
+    playlist.dump(playlist_path)
 
     print_out("\n\nJoining files...")
     target = _video_target_filename(video, args.format)
-    _join_vods(target_dir, file_paths, target)
+    _join_vods(playlist_path, target)
 
     if args.keep:
         print_out("\nTemporary files not deleted: {}".format(target_dir))
diff --git a/twitchdl/download.py b/twitchdl/download.py
index 37b5166..1d54e96 100644
--- a/twitchdl/download.py
+++ b/twitchdl/download.py
@@ -1,6 +1,7 @@
 import os
 import requests
 
+from collections import OrderedDict
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from datetime import datetime
 from functools import partial
@@ -75,13 +76,17 @@ def _print_progress(futures):
         print_out("\r" + msg.ljust(max_msg_size), end="")
 
 
-def download_files(base_url, directory, filenames, max_workers):
-    urls = [base_url + f for f in filenames]
-    paths = ["{}{:05d}.vod".format(directory, k) for k, _ in enumerate(filenames)]
-    partials = (partial(download_file, url, path) for url, path in zip(urls, paths))
+def download_files(base_url, target_dir, vod_paths, max_workers):
+    """
+    Downloads a list of VODs defined by a common `base_url` and a list of
+    `vod_paths`, returning a dict which maps the paths to the downloaded files.
+    """
+    urls = [base_url + path for path in vod_paths]
+    targets = ["{}{:05d}.ts".format(target_dir, k) for k, _ in enumerate(vod_paths)]
+    partials = (partial(download_file, url, path) for url, path in zip(urls, targets))
 
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = [executor.submit(fn) for fn in partials]
         _print_progress(futures)
 
-    return paths
+    return OrderedDict(zip(vod_paths, targets))