twitch-dl/twitchdl/http.py

177 lines
5.2 KiB
Python
Raw Normal View History

import asyncio
import logging
import os
import time
from abc import ABC, abstractmethod
2024-08-28 08:59:23 +00:00
from pathlib import Path
from typing import Iterable, Optional, Tuple
2024-04-04 06:20:10 +00:00
import httpx
2024-08-28 09:07:25 +00:00
from twitchdl.exceptions import ConsoleError
from twitchdl.progress import Progress
# Module-level logger, named after this module for standard filtering.
logger = logging.getLogger(__name__)

KB = 1024

CHUNK_SIZE = 256 * KB
"""How much of a VOD to download in each iteration"""

RETRY_COUNT = 5
"""Number of times to retry failed downloads before aborting."""

TIMEOUT = 30
"""
Number of seconds to wait before aborting when there is no network activity.
https://www.python-httpx.org/advanced/#timeout-configuration
"""
class TokenBucket(ABC):
    """Interface for download rate-limiting strategies."""

    @abstractmethod
    def advance(self, size: int):
        """Record that `size` bytes have just been transferred."""
        pass
class LimitingTokenBucket(TokenBucket):
    """Limit the download speed by strategically inserting sleeps."""

    def __init__(self, rate: int, capacity: Optional[int] = None):
        # `rate` is bytes per second; capacity defaults to two seconds' worth.
        self.rate: int = rate
        self.capacity: int = capacity or 2 * rate
        self.available: int = 0
        self.last_refilled: float = time.time()

    def advance(self, size: int):
        """Called every time a chunk of data is downloaded."""
        self._refill()
        shortfall = size - self.available
        if shortfall > 0:
            # Not enough tokens: sleep long enough for the deficit to accrue.
            time.sleep(shortfall / self.rate)
        self.available -= size

    def _refill(self):
        """Increase available capacity according to elapsed time since last refill."""
        now = time.time()
        earned = int((now - self.last_refilled) * self.rate)
        self.available = min(self.capacity, self.available + earned)
        self.last_refilled = now
class EndlessTokenBucket(TokenBucket):
    """No-op bucket used when download speed is not limited."""

    def advance(self, size: int):
        """Accept any amount of data without ever throttling."""
        pass
async def download(
    client: httpx.AsyncClient,
    task_id: int,
    source: str,
    target: Path,
    progress: Progress,
    token_bucket: TokenBucket,
):
    """Stream `source` to `target`, reporting progress and applying the rate limit.

    Writes to `<target>.tmp` first and moves it into place when complete, so
    partially downloaded chunks never persist at `target` if the download is
    canceled or --keep is used.
    """
    tmp_target = f"{target}.tmp"
    with open(tmp_target, "wb") as f:
        async with client.stream("GET", source) as response:
            # NOTE(review): assumes the server sends Content-Length; a missing
            # header would make this raise TypeError — confirm against callers.
            size = int(response.headers.get("content-length"))
            progress.start(task_id, size)
            async for chunk in response.aiter_bytes(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                size = len(chunk)
                token_bucket.advance(size)
                progress.advance(task_id, size)
            progress.end(task_id)
    # Fix: os.replace overwrites an existing target atomically; os.rename
    # raises FileExistsError on Windows when the destination already exists.
    os.replace(tmp_target, target)
async def download_with_retries(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    task_id: int,
    source: str,
    target: Path,
    progress: Progress,
    token_bucket: TokenBucket,
):
    """Download `source` to `target`, retrying up to RETRY_COUNT times on network errors.

    Skips the download if `target` already exists. Concurrency across tasks is
    bounded by `semaphore`. Re-raises the final `httpx.RequestError` once all
    retries are exhausted.
    """
    async with semaphore:
        if target.exists():
            size = os.path.getsize(target)
            progress.already_downloaded(task_id, size)
            return

        for n in range(RETRY_COUNT):
            try:
                return await download(client, task_id, source, target, progress, token_bucket)
            except httpx.RequestError:
                # Fix: the message was a plain string missing its f-prefix, so
                # the literal text "{task_id}" was logged. Use lazy %-style
                # arguments, which is also preferred for logging calls.
                logger.exception("Task %s failed. Retrying. Maybe.", task_id)
                progress.abort(task_id)
                if n + 1 >= RETRY_COUNT:
                    raise

        # Unreachable: the loop either returns or re-raises on the last retry.
        raise Exception("Should not happen")
async def download_all(
    source_targets: Iterable[Tuple[str, Path]],
    workers: int,
    *,
    count: Optional[int] = None,
    rate_limit: Optional[int] = None,
):
    """Concurrently download every (source, target) pair.

    At most `workers` downloads run at once; `rate_limit` (bytes/second),
    when given, throttles the overall transfer speed.
    """
    progress = Progress(count)
    if rate_limit:
        token_bucket: TokenBucket = LimitingTokenBucket(rate_limit)
    else:
        token_bucket = EndlessTokenBucket()

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        semaphore = asyncio.Semaphore(workers)
        coros = []
        for task_id, (source, target) in enumerate(source_targets):
            coros.append(
                download_with_retries(
                    client,
                    semaphore,
                    task_id,
                    source,
                    target,
                    progress,
                    token_bucket,
                )
            )
        await asyncio.gather(*coros)
2024-08-28 09:07:25 +00:00
def download_file(url: str, target: Path, retries: int = RETRY_COUNT) -> None:
    """Download URL to given target path with retries.

    Raises ConsoleError once all attempts fail.
    """
    error_message = ""
    attempt = 0
    while attempt < retries:
        try:
            suffix = f" (retry {attempt})" if attempt > 0 else ""
            logger.info(f"Downloading {url} to {target}{suffix}")
            return _do_download_file(url, target)
        except httpx.HTTPStatusError as ex:
            logger.error(ex)
            error_message = f"Server responded with HTTP {ex.response.status_code}"
        except httpx.RequestError as ex:
            logger.error(ex)
            error_message = str(ex)
        attempt += 1

    raise ConsoleError(f"Failed downloading after {retries} attempts: {error_message}")
2024-08-28 09:07:25 +00:00
def _do_download_file(url: str, target: Path) -> None:
    """Stream `url` into `target` via a sibling temp file.

    Raises:
        httpx.HTTPStatusError: on non-2xx responses.
        httpx.RequestError: on network failures.
    """
    # Write to a .tmp file so a failed download never leaves a truncated
    # file at `target`.
    tmp_path = Path(str(target) + ".tmp")

    with httpx.stream("GET", url, timeout=TIMEOUT, follow_redirects=True) as response:
        response.raise_for_status()
        with open(tmp_path, "wb") as f:
            for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
                f.write(chunk)

    # Fix: os.replace overwrites an existing target atomically; os.rename
    # raises FileExistsError on Windows when the destination already exists.
    os.replace(tmp_path, target)