2022-08-13 09:41:13 +00:00
|
|
|
import asyncio
|
|
|
|
import logging
|
2022-08-14 08:02:41 +00:00
|
|
|
import os
|
2022-08-13 09:41:13 +00:00
|
|
|
import time
|
2024-03-23 06:56:50 +00:00
|
|
|
from abc import ABC, abstractmethod
|
2024-08-28 08:59:23 +00:00
|
|
|
from pathlib import Path
|
2024-08-30 05:37:50 +00:00
|
|
|
from typing import Iterable, Optional, Tuple
|
2022-08-13 09:41:13 +00:00
|
|
|
|
2024-04-04 06:20:10 +00:00
|
|
|
import httpx
|
|
|
|
|
2024-08-28 09:07:25 +00:00
|
|
|
from twitchdl.exceptions import ConsoleError
|
2022-08-13 09:41:13 +00:00
|
|
|
from twitchdl.progress import Progress
|
|
|
|
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Number of bytes in one kibibyte; used to express sizes below.
KB = 1024

CHUNK_SIZE = 256 * KB
"""Size in bytes of each chunk read from a response stream while downloading."""

RETRY_COUNT = 5
"""Maximum number of attempts made for a download before giving up."""

TIMEOUT = 30
"""
Number of seconds to wait before aborting when there is no network activity.
https://www.python-httpx.org/advanced/#timeout-configuration
"""
|
|
|
|
|
|
|
|
|
2024-03-23 06:56:50 +00:00
|
|
|
class TokenBucket(ABC):
    """Abstract interface for objects notified about each downloaded chunk."""

    @abstractmethod
    def advance(self, size: int):
        """Account for a downloaded chunk of the given ``size``."""
        ...
|
|
|
|
|
|
|
|
|
|
|
|
class LimitingTokenBucket(TokenBucket):
    """Limit the download speed by strategically inserting sleeps.

    Tokens are credited at ``rate`` per second of elapsed time, up to
    ``capacity``. Every call to :meth:`advance` spends ``size`` tokens and
    sleeps first when not enough tokens are available.
    """

    def __init__(self, rate: int, capacity: Optional[int] = None):
        """
        Args:
            rate: Number of tokens credited per second.
            capacity: Upper bound on accumulated tokens. Defaults to
                ``rate * 2`` when not given (or falsy).
        """
        self.rate: int = rate
        self.capacity: int = capacity or rate * 2
        self.available: int = 0
        # Use the monotonic clock: elapsed-time measurements must not be
        # affected by wall-clock adjustments (e.g. NTP stepping the time
        # backwards would otherwise produce a negative refill).
        self.last_refilled: float = time.monotonic()

    def advance(self, size: int):
        """Called every time a chunk of data is downloaded.

        Spends ``size`` tokens. If fewer than ``size`` are available, sleeps
        for as long as it takes ``rate`` to cover the shortfall. Note that
        ``time.sleep`` blocks the calling thread.
        """
        self._refill()

        deficit = size - self.available
        if deficit > 0:
            time.sleep(deficit / self.rate)

        # May go negative right after sleeping; the time slept is credited
        # back by the next refill.
        self.available -= size

    def _refill(self):
        """Increase available capacity according to elapsed time since last refill."""
        now = time.monotonic()
        elapsed = now - self.last_refilled
        refill_amount = int(elapsed * self.rate)
        self.available = min(self.available + refill_amount, self.capacity)
        self.last_refilled = now
|
|
|
|
|
|
|
|
|
2024-03-23 06:56:50 +00:00
|
|
|
class EndlessTokenBucket(TokenBucket):
    """Used when download speed is not limited."""

    def advance(self, size: int):
        """Do nothing; no limit is applied to the downloaded chunk."""
        return None
|
|
|
|
|
|
|
|
|
|
|
|
async def download(
    client: httpx.AsyncClient,
    task_id: int,
    source: str,
    target: Path,
    progress: Progress,
    token_bucket: TokenBucket,
):
    """Stream ``source`` into ``target``, reporting progress per chunk.

    Args:
        client: HTTP client used to issue the streaming GET request.
        task_id: Identifier passed to every ``progress`` call for this download.
        source: URL to download.
        target: Final path of the downloaded file.
        progress: Receives ``start``, ``advance`` and ``end`` notifications.
        token_bucket: Notified with the length of every chunk written.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        httpx.RequestError: If the request itself fails.
    """
    # Download to a temp file first, then move it to target when finished, so
    # that a partially written chunk never sits at the final path (it could
    # otherwise persist if the download is canceled or --keep is used).
    tmp_target = f"{target}.tmp"
    with open(tmp_target, "wb") as f:
        async with client.stream("GET", source) as response:
            # Do not save the body of an error response as if it were data.
            response.raise_for_status()
            # NOTE(review): a response without a content-length header makes
            # int(None) raise TypeError here; confirm the server always sends it.
            total_size = int(response.headers.get("content-length"))
            progress.start(task_id, total_size)
            async for chunk in response.aiter_bytes(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                chunk_size = len(chunk)
                token_bucket.advance(chunk_size)
                progress.advance(task_id, chunk_size)
            progress.end(task_id)
    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows if the destination exists.
    os.replace(tmp_target, target)
|
2022-08-13 09:41:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
async def download_with_retries(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    task_id: int,
    source: str,
    target: Path,
    progress: Progress,
    token_bucket: TokenBucket,
):
    """Download ``source`` to ``target``, retrying on request errors.

    The whole operation runs while holding ``semaphore``. If ``target``
    already exists it is reported to ``progress`` as already downloaded and
    nothing is fetched. Otherwise up to ``RETRY_COUNT`` attempts are made.

    Raises:
        httpx.RequestError: Re-raised when the last attempt fails.
    """
    async with semaphore:
        if target.exists():
            progress.already_downloaded(task_id, target.stat().st_size)
            return

        for n in range(RETRY_COUNT):
            try:
                return await download(client, task_id, source, target, progress, token_bucket)
            except httpx.RequestError:
                # Pass task_id as a logging argument so the actual id is
                # interpolated into the message.
                logger.exception("Task %s failed. Retrying. Maybe.", task_id)
                progress.abort(task_id)
                if n + 1 >= RETRY_COUNT:
                    raise

        # Only reachable if the loop body never ran (RETRY_COUNT < 1).
        raise Exception("Should not happen")
|
|
|
|
|
|
|
|
|
|
|
|
async def download_all(
    source_targets: Iterable[Tuple[str, Path]],
    workers: int,
    *,
    count: Optional[int] = None,
    rate_limit: Optional[int] = None,
):
    """Download every (source, target) pair concurrently.

    Args:
        source_targets: Pairs of source URL and destination path.
        workers: Initial value of the semaphore shared by all downloads.
        count: Passed to ``Progress``.
        rate_limit: When truthy, downloads share a ``LimitingTokenBucket``
            created with this rate; otherwise an ``EndlessTokenBucket``.
    """
    progress = Progress(count)

    if rate_limit:
        token_bucket = LimitingTokenBucket(rate_limit)
    else:
        token_bucket = EndlessTokenBucket()

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        semaphore = asyncio.Semaphore(workers)

        jobs = []
        for task_id, (source, target) in enumerate(source_targets):
            job = download_with_retries(
                client,
                semaphore,
                task_id,
                source,
                target,
                progress,
                token_bucket,
            )
            jobs.append(job)

        await asyncio.gather(*jobs)
|
2024-08-28 09:07:25 +00:00
|
|
|
|
|
|
|
|
2024-08-28 10:38:40 +00:00
|
|
|
def download_file(url: str, target: Path, retries: int = RETRY_COUNT) -> None:
    """Fetch ``url`` into ``target``, making up to ``retries`` attempts.

    Raises:
        ConsoleError: When every attempt failed; the message carries the
            error of the last attempt.
    """
    error_message = ""

    for attempt in range(retries):
        retry_info = "" if attempt == 0 else f" (retry {attempt})"
        logger.info(f"Downloading {url} to {target}{retry_info}")

        try:
            return _do_download_file(url, target)
        except httpx.HTTPStatusError as status_error:
            logger.error(status_error)
            error_message = f"Server responded with HTTP {status_error.response.status_code}"
        except httpx.RequestError as request_error:
            logger.error(request_error)
            error_message = str(request_error)

    raise ConsoleError(f"Failed downloading after {retries} attempts: {error_message}")
|
2024-08-28 09:07:25 +00:00
|
|
|
|
|
|
|
|
2024-08-28 10:38:40 +00:00
|
|
|
def _do_download_file(url: str, target: Path) -> None:
    """Download ``url`` to ``target`` in a single attempt.

    The body is streamed into ``<target>.tmp`` and moved over ``target`` only
    after the whole response has been written.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
        httpx.RequestError: If the request itself fails.
    """
    tmp_path = Path(str(target) + ".tmp")

    with httpx.stream("GET", url, timeout=TIMEOUT, follow_redirects=True) as response:
        response.raise_for_status()
        with open(tmp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
                f.write(chunk)

    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises on Windows if the destination already exists.
    os.replace(tmp_path, target)
|