Improve download process

* allow resuming
* retry downloads
* add timeouts
This commit is contained in:
Ivan Habunek 2019-01-29 14:46:24 +01:00
parent 6dd65e7cbc
commit da60560b63
No known key found for this signature in database
GPG Key ID: CDBD63C43A30BB95
3 changed files with 90 additions and 23 deletions

View File

@ -1,3 +1,5 @@
import os
import pathlib
import re import re
import subprocess import subprocess
import tempfile import tempfile
@ -5,10 +7,11 @@ import tempfile
from datetime import datetime from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial from functools import partial
from urllib.request import urlretrieve
from twitchdl import twitch from twitchdl import twitch
from twitchdl.download import download_file
from twitchdl.output import print_out from twitchdl.output import print_out
from twitchdl.utils import slugify
def read_int(msg, min, max, default): def read_int(msg, min, max, default):
@ -76,7 +79,7 @@ def videos(channel_name, **kwargs):
_print_video(video) _print_video(video)
def _select_playlist_by_quality(playlists): def _select_quality(playlists):
print("\nAvailable qualities:") print("\nAvailable qualities:")
for no, v in playlists.items(): for no, v in playlists.items():
print("{}) {}".format(no, v[0])) print("{}) {}".format(no, v[0]))
@ -84,7 +87,7 @@ def _select_playlist_by_quality(playlists):
keys = list(playlists.keys()) keys = list(playlists.keys())
no = read_int("Choose quality", min=min(keys), max=max(keys), default=keys[0]) no = read_int("Choose quality", min=min(keys), max=max(keys), default=keys[0])
return playlists[no][1] return playlists[no]
def _print_progress(futures): def _print_progress(futures):
@ -94,9 +97,9 @@ def _print_progress(futures):
start_time = datetime.now() start_time = datetime.now()
for future in as_completed(futures): for future in as_completed(futures):
file, headers = future.result() size = future.result()
percentage = 100 * counter // total percentage = 100 * counter // total
total_size += int(headers.get("Content-Length")) total_size += size
duration = (datetime.now() - start_time).seconds duration = (datetime.now() - start_time).seconds
speed = total_size // duration if duration else 0 speed = total_size // duration if duration else 0
remaining = (total - counter) * duration / counter remaining = (total - counter) * duration / counter
@ -109,23 +112,23 @@ def _print_progress(futures):
def _download_files(base_url, directory, filenames, max_workers): def _download_files(base_url, directory, filenames, max_workers):
args = [(base_url.format(f), "/".join([directory, f])) for f in filenames] urls = [base_url.format(f) for f in filenames]
paths = ["/".join([directory, f]) for f in filenames]
fns = [partial(urlretrieve, url, path) for url, path in args] partials = (partial(download_file, url, path) for url, path in zip(urls, paths))
with ThreadPoolExecutor(max_workers=max_workers) as executor: with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(fn) for fn in fns] futures = [executor.submit(fn) for fn in partials]
_print_progress(futures) _print_progress(futures)
return [f.result()[0] for f in futures] return paths
def _join_vods(directory, filenames, target): def _join_vods(directory, paths, target):
input_path = "{}/files.txt".format(directory) input_path = "{}/files.txt".format(directory)
with open(input_path, 'w') as f: with open(input_path, 'w') as f:
for filename in filenames: for path in paths:
f.write('file {}\n'.format(filename)) f.write('file {}\n'.format(os.path.basename(path)))
result = subprocess.run([ result = subprocess.run([
"ffmpeg", "ffmpeg",
@ -141,9 +144,17 @@ def _join_vods(directory, filenames, target):
def _video_target_filename(video, format): def _video_target_filename(video, format):
dttm = re.sub(r'\D+', '-', video['published_at'][:16]) match = re.search(r"^(\d{4})-(\d{2})-(\d{2})T", video['published_at'])
name = " - ".join([dttm, video['channel']['display_name'], video['game']]) date = "".join(match.groups())
return "{}.{}".format(name, format)
name = "_".join([
date,
video['_id'][1:],
video['channel']['name'],
slugify(video['title']),
])
return name + "." + format
def download(video_id, max_workers, format='mkv', **kwargs): def download(video_id, max_workers, format='mkv', **kwargs):
@ -155,18 +166,25 @@ def download(video_id, max_workers, format='mkv', **kwargs):
print("Fetching playlists...") print("Fetching playlists...")
playlists = twitch.get_playlists(video_id, access_token) playlists = twitch.get_playlists(video_id, access_token)
playlist_url = _select_playlist_by_quality(playlists) quality, playlist_url = _select_quality(playlists)
print("\nFetching playlist...") print("\nFetching playlist...")
base_url, filenames = twitch.get_playlist_urls(playlist_url) base_url, filenames = twitch.get_playlist_urls(playlist_url)
target = _video_target_filename(video, format) # Create a temp dir to store downloads if it doesn't exist
directory = '{}/twitch-dl/{}/{}'.format(tempfile.gettempdir(), video_id, quality)
pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
print("Download dir: {}".format(directory))
with tempfile.TemporaryDirectory() as directory: print("Downloading VODs with {} workers...".format(max_workers))
print("Downloading with {} workers...".format(max_workers)) paths = _download_files(base_url, directory, filenames, max_workers)
_download_files(base_url, directory, filenames, max_workers)
print("\n\nJoining files...") print("\n\nJoining files...")
_join_vods(directory, filenames, target) target = _video_target_filename(video, format)
_join_vods(directory, paths, target)
print("\nDeleting vods...")
for path in paths:
os.unlink(path)
print("\nDownloaded: {}".format(target)) print("\nDownloaded: {}".format(target))

38
twitchdl/download.py Normal file
View File

@ -0,0 +1,38 @@
import os
import requests
from requests.exceptions import RequestException
CHUNK_SIZE = 1024
CONNECT_TIMEOUT = 5
class DownloadFailed(Exception):
    """Raised by download_file when every download attempt has failed."""
def _download(url, path):
    """Stream the resource at `url` into `path` and return the bytes written.

    The payload is first written to ``path + ".tmp"`` and renamed to `path`
    only after the whole body has been received, so `path` itself never
    holds a partially written download.

    Args:
        url: Address to fetch.
        path: Final location of the downloaded file.

    Returns:
        Number of bytes written to disk.

    Raises:
        requests.exceptions.RequestException: on connection problems,
            timeouts or an HTTP error status.
    """
    tmp_path = path + ".tmp"
    size = 0

    # Using the response as a context manager releases the connection even
    # when the transfer is interrupted half way through.
    with requests.get(url, stream=True, timeout=CONNECT_TIMEOUT) as response:
        # Without this check the body of a 4xx/5xx reply would be saved as
        # if it were the requested file.
        response.raise_for_status()

        with open(tmp_path, 'wb') as target:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                target.write(chunk)
                size += len(chunk)

    os.rename(tmp_path, path)
    return size
def download_file(url, path, retries=3):
    """Download `url` to `path`, retrying on request errors.

    Args:
        url: Address to fetch.
        path: Destination file. If it already exists the download is
            skipped, which is what allows an interrupted run to resume.
        retries: Maximum number of download attempts.

    Returns:
        Number of bytes downloaded, or 0 when `path` already existed.

    Raises:
        DownloadFailed: when no attempt succeeded; the last request error
            (if any) is attached as the cause.
    """
    if os.path.exists(path):
        return 0

    last_error = None
    for _ in range(retries):
        try:
            return _download(url, path)
        except RequestException as e:
            # `e` is unbound once the except block ends, so keep a reference
            # for chaining below.
            last_error = e
            print("Download failed: {}".format(e))

    raise DownloadFailed(
        "Failed to download {} after {} attempts".format(url, retries)
    ) from last_error

11
twitchdl/utils.py Normal file
View File

@ -0,0 +1,11 @@
import re
import unicodedata
def slugify(value):
    """Convert `value` into a lowercase ASCII slug.

    The value is converted to a string, NFKD-normalized and stripped of
    every character that cannot be encoded as ASCII. Characters other than
    word characters, whitespace and hyphens are then dropped, surrounding
    whitespace is trimmed, the text is lowercased, and each run of hyphens
    and/or whitespace is collapsed into a single hyphen.
    """
    ascii_text = (
        unicodedata.normalize('NFKD', str(value))
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    cleaned = re.sub(r'[^\w\s-]', '', ascii_text, flags=re.U)
    return re.sub(r'[-\s]+', '-', cleaned.strip().lower(), flags=re.U)