Use m3u8 lib to parse playlists

This commit is contained in:
Ivan Habunek
2019-08-23 12:36:05 +02:00
parent 6813bb51b4
commit 000754af8c
4 changed files with 61 additions and 102 deletions

View File

@ -24,6 +24,7 @@ setup(
packages=['twitchdl'], packages=['twitchdl'],
python_requires='>=3.5', python_requires='>=3.5',
install_requires=[ install_requires=[
"m3u8>=0.3.12,<0.4",
"requests>=2.13,<3.0", "requests>=2.13,<3.0",
], ],
entry_points={ entry_points={

View File

@ -1,12 +1,16 @@
import m3u8
import os import os
import pathlib import pathlib
import re import re
import requests
import shutil
import subprocess import subprocess
import tempfile import tempfile
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from functools import partial from functools import partial
from urllib.parse import urlparse
from twitchdl import twitch from twitchdl import twitch
from twitchdl.download import download_file from twitchdl.download import download_file
@ -96,13 +100,14 @@ def videos(channel_name, limit, offset, sort, **kwargs):
def _select_quality(playlists):
    """Ask the user to pick one of the available variant playlists.

    `playlists` is a sequence of variant playlist objects as produced by the
    m3u8 library (each exposing `media` and `stream_info`). The qualities are
    listed numbered from 1 and the chosen playlist object is returned.
    """
    print_out("\nAvailable qualities:")
    for number, playlist in enumerate(playlists, start=1):
        name = playlist.media[0].name if playlist.media else ""
        # Variants without a RESOLUTION attribute (e.g. audio only) report
        # `None` here, which cannot be iterated.
        resolution = "x".join(str(r) for r in playlist.stream_info.resolution or ())
        print_out("{}) {} [{}]".format(number, name, resolution))

    # The highest valid choice is len(playlists); allowing len(playlists) + 1
    # would make the lookup below raise IndexError.
    # NOTE(review): assumes read_int treats `max` as inclusive - confirm.
    no = read_int("Choose quality", min=1, max=len(playlists), default=1)

    # Displayed numbers are 1-based, the sequence is 0-based.
    return playlists[no - 1]
def _print_progress(futures): def _print_progress(futures):
@ -127,7 +132,7 @@ def _print_progress(futures):
def _download_files(base_url, directory, filenames, max_workers): def _download_files(base_url, directory, filenames, max_workers):
urls = [base_url.format(f) for f in filenames] urls = [base_url + f for f in filenames]
paths = ["/".join([directory, f]) for f in filenames] paths = ["/".join([directory, f]) for f in filenames]
partials = (partial(download_file, url, path) for url, path in zip(urls, paths)) partials = (partial(download_file, url, path) for url, path in zip(urls, paths))
@ -172,7 +177,7 @@ def _video_target_filename(video, format):
return name + "." + format return name + "." + format
def parse_video_id(video_id): def _parse_video_id(video_id):
"""This can be either a integer ID or an URL to the video on twitch.""" """This can be either a integer ID or an URL to the video on twitch."""
if re.search(r"^\d+$", video_id): if re.search(r"^\d+$", video_id):
return int(video_id) return int(video_id)
@ -184,8 +189,33 @@ def parse_video_id(video_id):
raise ConsoleError("Invalid video ID given, expected integer ID or Twitch URL") raise ConsoleError("Invalid video ID given, expected integer ID or Twitch URL")
def _get_files(playlist, start, end):
"""Extract files for download from playlist."""
vod_start = 0
for segment in playlist.segments:
vod_end = vod_start + segment.duration
# `vod_end > start` is used here becuase it's better to download a bit
# more than a bit less, similar for the end condition
start_condition = not start or vod_end > start
end_condition = not end or vod_start < end
if start_condition and end_condition:
yield segment.uri
vod_start = vod_end
def _crete_temp_dir(base_uri):
"""Create a temp dir to store downloads if it doesn't exist."""
path = urlparse(base_uri).path
directory = '{}/twitch-dl{}'.format(tempfile.gettempdir(), path)
pathlib.Path(directory).mkdir(parents=True, exist_ok=True)
return directory
def download(video_id, max_workers, format='mkv', start=None, end=None, keep=False, **kwargs): def download(video_id, max_workers, format='mkv', start=None, end=None, keep=False, **kwargs):
video_id = parse_video_id(video_id) video_id = _parse_video_id(video_id)
if start and end and end <= start: if start and end and end <= start:
raise ConsoleError("End time must be greater than start time") raise ConsoleError("End time must be greater than start time")
@ -199,33 +229,34 @@ def download(video_id, max_workers, format='mkv', start=None, end=None, keep=Fal
print_out("Fetching access token...") print_out("Fetching access token...")
access_token = twitch.get_access_token(video_id) access_token = twitch.get_access_token(video_id)
# TODO: save playlists for debugging purposes
print_out("Fetching playlists...") print_out("Fetching playlists...")
playlists = twitch.get_playlists(video_id, access_token) playlists = twitch.get_playlists(video_id, access_token)
quality, playlist_url = _select_quality(playlists) playlists = m3u8.loads(playlists)
selected = _select_quality(playlists.playlists)
print_out("\nFetching playlist...") print_out("\nFetching playlist...")
base_url, filenames = twitch.get_playlist_urls(playlist_url, start, end) response = requests.get(selected.uri)
response.raise_for_status()
playlist = m3u8.loads(response.text)
if not filenames: base_uri = re.sub("/[^/]+$", "/", selected.uri)
raise ConsoleError("No vods matched, check your start and end times") target_dir = _crete_temp_dir(base_uri)
filenames = list(_get_files(playlist, start, end))
# Create a temp dir to store downloads if it doesn't exist print_out("\nDownloading {} VODs using {} workers to {}".format(
directory = '{}/twitch-dl/{}/{}'.format(tempfile.gettempdir(), video_id, quality) len(filenames), max_workers, target_dir))
pathlib.Path(directory).mkdir(parents=True, exist_ok=True) _download_files(base_uri, target_dir, filenames, max_workers)
print_out("Download dir: {}".format(directory))
print_out("Downloading {} VODs using {} workers...".format(len(filenames), max_workers))
paths = _download_files(base_url, directory, filenames, max_workers)
print_out("\n\nJoining files...") print_out("\n\nJoining files...")
target = _video_target_filename(video, format) target = _video_target_filename(video, format)
_join_vods(directory, paths, target) _join_vods(target_dir, filenames, target)
if keep: if keep:
print_out("\nTemporary files not deleted: {}".format(directory)) print_out("\nTemporary files not deleted: {}".format(target_dir))
else: else:
print_out("\nDeleting vods...") print_out("\nDeleting temporary files...")
for path in paths: shutil.rmtree(target_dir)
os.unlink(path)
print_out("Downloaded: {}".format(target)) print_out("Downloaded: {}".format(target))

View File

@ -1,64 +0,0 @@
import re
from collections import OrderedDict
from datetime import timedelta
from twitchdl.exceptions import ConsoleError
def parse_playlists(data):
    """Extract the available qualities from a master playlist.

    `data` is the text of the master playlist. Returns an OrderedDict which
    maps a sequence number (starting at 1) to a `(name, url)` tuple, where
    `name` is taken from the most recent preceding `#EXT-X-MEDIA` line (or
    is `None` if there was none) and `url` is a line starting with "http".
    """
    # Names and group IDs are quoted strings which may contain spaces or
    # punctuation, e.g. NAME="1080p60 (source)", so match up to the closing
    # quote instead of only word characters.
    media_pattern = re.compile(
        r'^#EXT-X-MEDIA:TYPE=VIDEO,GROUP-ID="(?P<group>[^"]+)",NAME="(?P<name>[^"]+)"')

    playlists = OrderedDict()
    name = None
    # Iterate by line; splitting on any whitespace would break tags whose
    # attribute values contain spaces.
    for line in data.splitlines():
        line = line.strip()
        match = media_pattern.match(line)
        if match:
            name = match.group('name')
        elif line.startswith('http'):
            playlists[len(playlists) + 1] = (name, line)

    return playlists
def _get_files(playlist, start, end):
matches = re.findall(r"#EXTINF:(\d+)(\.\d+)?,.*?\s+(\d+.ts)", playlist)
vod_start = 0
for m in matches:
filename = m[2]
vod_duration = int(m[0])
vod_end = vod_start + vod_duration
# `vod_end > start` is used here becuase it's better to download a bit
# more than a bit less, similar for the end condition
start_condition = not start or vod_end > start
end_condition = not end or vod_start < end
if start_condition and end_condition:
yield filename
vod_start = vod_end
def parse_playlist(url, playlist, start, end):
    """Work out which files of a media playlist need to be downloaded.

    `url` is the address the playlist was fetched from and `playlist` is its
    text. Returns a `(base_url, files)` tuple: `base_url` is `url` with its
    last path component replaced by a `{}` placeholder, and `files` is the
    list of segment file names selected by `_get_files` for the given
    `start` and `end`.

    Raises ConsoleError if `start` or `end` is greater than the total video
    duration declared by the playlist.
    """
    base_url = re.sub("/[^/]+$", "/{}", url)

    match = re.search(r"#EXT-X-TWITCH-TOTAL-SECS:(\d+)(\.\d+)?", playlist)

    # The total duration tag may be absent; in that case the range cannot be
    # validated, so skip the checks instead of failing on a missing match.
    if match:
        total_seconds = int(match.group(1))

        # Now that video duration is known, validate start and end max values
        for label, value in (("Start", start), ("End", end)):
            if value and value > total_seconds:
                raise ConsoleError("{} time {} greater than video duration {}".format(
                    label,
                    timedelta(seconds=value),
                    timedelta(seconds=total_seconds)
                ))

    files = list(_get_files(playlist, start, end))
    return base_url, files

View File

@ -6,7 +6,6 @@ import requests
from twitchdl import CLIENT_ID from twitchdl import CLIENT_ID
from twitchdl.exceptions import ConsoleError from twitchdl.exceptions import ConsoleError
from twitchdl.parse import parse_playlists, parse_playlist
def authenticated_get(url, params={}, headers={}): def authenticated_get(url, params={}, headers={}):
@ -73,6 +72,9 @@ def get_access_token(video_id):
def get_playlists(video_id, access_token): def get_playlists(video_id, access_token):
"""
For a given video return a playlist which contains possible video qualities.
"""
url = "http://usher.twitch.tv/vod/{}".format(video_id) url = "http://usher.twitch.tv/vod/{}".format(video_id)
response = requests.get(url, params={ response = requests.get(url, params={
@ -82,15 +84,4 @@ def get_playlists(video_id, access_token):
"player": "twitchweb", "player": "twitchweb",
}) })
response.raise_for_status() response.raise_for_status()
return response.content.decode('utf-8')
data = response.content.decode('utf-8')
return parse_playlists(data)
def get_playlist_urls(url, start, end):
    """Fetch the playlist at `url` and return the result of parsing it.

    The response body is decoded as UTF-8 and passed to `parse_playlist`
    together with `url`, `start` and `end`. An HTTP error status raises via
    `raise_for_status`.
    """
    playlist_response = requests.get(url)
    playlist_response.raise_for_status()
    return parse_playlist(url, playlist_response.content.decode('utf-8'), start, end)