twitch-dl/twitchdl/http.py

import asyncio
import httpx
import logging
import os
import time

from abc import ABC, abstractmethod
from typing import List, Optional

from twitchdl.progress import Progress

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Number of bytes in one kibibyte; used to express sizes below.
KB = 1024

# Passed as `chunk_size=` to `response.aiter_bytes()` in `download`.
CHUNK_SIZE = 256 * KB
"""How much of a VOD to download in each iteration"""

# NOTE: `download_with_retries` loops over `range(RETRY_COUNT)`, so this is the
# total number of attempts (the first try included), not the number of extra
# retries after the first failure.
RETRY_COUNT = 5
"""Number of times to retry failed downloads before aborting."""

# Passed as `timeout=` to `httpx.AsyncClient` in `download_all`.
TIMEOUT = 30
"""
Number of seconds to wait before aborting when there is no network activity.
https://www.python-httpx.org/advanced/#timeout-configuration
"""


class TokenBucket(ABC):
    """Interface for objects that are notified about downloaded data.

    Implementations decide whether (and how) to delay the caller in order to
    throttle the download speed.
    """

    @abstractmethod
    def advance(self, size: int):
        """Account for `size` units of data which have just been downloaded."""


class LimitingTokenBucket(TokenBucket):
    """Limit the download speed by strategically inserting sleeps.

    The bucket gains `rate` tokens per second, up to `capacity` tokens. Every
    downloaded chunk consumes as many tokens as its size; when not enough
    tokens are available the caller is put to sleep for as long as it takes to
    accumulate the missing ones.
    """

    def __init__(self, rate: int, capacity: Optional[int] = None):
        """Create an empty bucket.

        Args:
            rate: Number of tokens added per second. Must be positive.
            capacity: Maximum number of tokens which can accumulate while
                idle. Defaults to twice the `rate` when not given (or zero).

        Raises:
            ValueError: If `rate` is not positive.
        """
        # A non-positive rate would otherwise only fail later, inside
        # `advance`, with a division by zero or a negative sleep length.
        if rate <= 0:
            raise ValueError(f"rate must be positive, got {rate!r}")

        self.rate: int = rate
        self.capacity: int = capacity or rate * 2
        self.available: int = 0
        # Use a monotonic clock: it only serves to measure elapsed intervals
        # and, unlike time.time(), it is unaffected by system clock changes.
        self.last_refilled: float = time.monotonic()

    def advance(self, size: int):
        """Called every time a chunk of data is downloaded.

        Consumes `size` tokens, sleeping first when the bucket does not hold
        enough of them.
        """
        self._refill()

        if self.available < size:
            deficit = size - self.available
            # NOTE: time.sleep() blocks the calling thread. When called from a
            # coroutine (as `download` does) this pauses the whole event loop.
            time.sleep(deficit / self.rate)

        # This may leave `available` negative after a sleep. The time slept is
        # counted as elapsed time by the next refill, which repays the debt.
        self.available -= size

    def _refill(self):
        """Increase available capacity according to elapsed time since last refill."""
        now = time.monotonic()
        elapsed = now - self.last_refilled
        refill_amount = int(elapsed * self.rate)
        # Never accumulate more than `capacity` tokens.
        self.available = min(self.available + refill_amount, self.capacity)
        self.last_refilled = now


class EndlessTokenBucket(TokenBucket):
    """No-op bucket, used when download speed is not limited."""

    def advance(self, size: int):
        """Accept any amount of downloaded data without delaying the caller."""
        return None


async def download(
    client: httpx.AsyncClient,
    task_id: int,
    source: str,
    target: str,
    progress: Progress,
    token_bucket: TokenBucket,
):
    """Stream the contents of URL `source` into the file `target`.

    Data is written to `<target>.tmp` and renamed to `target` only once the
    whole response has been received.

    Args:
        client: HTTP client used to perform the GET request.
        task_id: Identifier passed to `progress` for this download.
        source: URL to download.
        target: Path of the resulting file.
        progress: Notified when the download starts, advances and ends.
        token_bucket: Notified of every received chunk so it can throttle.

    Raises:
        httpx.HTTPStatusError: If the server responds with a 4xx or 5xx status.
        httpx.RequestError: If httpx fails to perform the request.
    """
    # Download to a temp file first, then rename it to target when done, to
    # avoid leaving partially saved chunks which may persist if the download
    # is canceled or --keep is used.
    tmp_target = f"{target}.tmp"
    with open(tmp_target, "wb") as f:
        async with client.stream("GET", source) as response:
            # Fail on error responses instead of saving the body of an error
            # page as if it were the requested data.
            response.raise_for_status()
            # NOTE(review): int(None) raises TypeError when the response has no
            # Content-Length header; confirm whether that can happen here.
            total_size = int(response.headers.get("content-length"))
            progress.start(task_id, total_size)
            async for chunk in response.aiter_bytes(chunk_size=CHUNK_SIZE):
                f.write(chunk)
                chunk_size = len(chunk)
                token_bucket.advance(chunk_size)
                progress.advance(task_id, chunk_size)
            progress.end(task_id)
    # Rename only after the `with` block has closed the file.
    os.rename(tmp_target, target)


async def download_with_retries(
    client: httpx.AsyncClient,
    semaphore: asyncio.Semaphore,
    task_id: int,
    source: str,
    target: str,
    progress: Progress,
    token_bucket: TokenBucket,
):
    """Download `source` to `target`, trying again on request errors.

    The whole operation runs while holding `semaphore`. If `target` already
    exists, nothing is downloaded and `progress` is told the file size.
    Otherwise `download` is attempted up to RETRY_COUNT times.

    Raises:
        httpx.RequestError: If the last attempt fails with a request error.
    """
    async with semaphore:
        if os.path.exists(target):
            size = os.path.getsize(target)
            progress.already_downloaded(task_id, size)
            return

        for n in range(RETRY_COUNT):
            try:
                return await download(client, task_id, source, target, progress, token_bucket)
            except httpx.RequestError:
                # Pass task_id as a logging argument so it is interpolated
                # into the message.
                logger.exception("Task %s failed. Retrying. Maybe.", task_id)
                progress.abort(task_id)
                # Out of attempts: propagate the error of the last one.
                if n + 1 >= RETRY_COUNT:
                    raise

        # Only reachable if the loop body never runs (RETRY_COUNT < 1).
        raise Exception("Should not happen")


async def download_all(
    sources: List[str],
    targets: List[str],
    workers: int,
    *,
    rate_limit: Optional[int] = None
):
    """Download every source URL to its target path.

    `sources` and `targets` are paired by position using `zip`. At most
    `workers` downloads hold the shared semaphore at the same time.

    Args:
        sources: URLs to download.
        targets: Paths to save the downloads to, in the same order as sources.
        workers: Initial value of the semaphore guarding the downloads.
        rate_limit: Rate given to LimitingTokenBucket. When None or 0, the
            download speed is not limited.
    """
    progress = Progress(len(sources))

    if rate_limit:
        token_bucket = LimitingTokenBucket(rate_limit)
    else:
        token_bucket = EndlessTokenBucket()

    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        semaphore = asyncio.Semaphore(workers)

        # One coroutine per (source, target) pair; the index is the task id.
        jobs = []
        for task_id, (source, target) in enumerate(zip(sources, targets)):
            job = download_with_retries(
                client,
                semaphore,
                task_id,
                source,
                target,
                progress,
                token_bucket,
            )
            jobs.append(job)

        await asyncio.gather(*jobs)
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`import asyncio`
			`import httpx`
			`import logging`
Don't download already downloaded files 2022-08-14 08:02:41 +00:00			`import os`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`import time`

Use an abstract base class for TokenBucket 2024-03-23 06:56:50 +00:00			`from abc import ABC, abstractmethod`
			`from typing import List, Optional`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00
			`from twitchdl.progress import Progress`

			`logger = logging.getLogger(__name__)`

			`KB = 1024`

			`CHUNK_SIZE = 256 * KB`
			`"""How much of a VOD to download in each iteration"""`

			`RETRY_COUNT = 5`
			`"""Number of times to retry failed downloads before aborting."""`

			`TIMEOUT = 30`
			`"""`
			`Number of seconds to wait before aborting when there is no network activity.`
			`https://www.python-httpx.org/advanced/#timeout-configuration`
			`"""`


Use an abstract base class for TokenBucket 2024-03-23 06:56:50 +00:00			`class TokenBucket(ABC):`
			`@abstractmethod`
			`def advance(self, size: int):`
			`pass`


			`class LimitingTokenBucket(TokenBucket):`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`"""Limit the download speed by strategically inserting sleeps."""`

			`def __init__(self, rate: int, capacity: Optional[int] = None):`
			`self.rate: int = rate`
			`self.capacity: int = capacity or rate * 2`
			`self.available: int = 0`
			`self.last_refilled: float = time.time()`

			`def advance(self, size: int):`
			`"""Called every time a chunk of data is downloaded."""`
			`self._refill()`

			`if self.available < size:`
			`deficit = size - self.available`
			`time.sleep(deficit / self.rate)`

			`self.available -= size`

			`def _refill(self):`
			`"""Increase available capacity according to elapsed time since last refill."""`
			`now = time.time()`
			`elapsed = now - self.last_refilled`
			`refill_amount = int(elapsed * self.rate)`
			`self.available = min(self.available + refill_amount, self.capacity)`
			`self.last_refilled = now`


Use an abstract base class for TokenBucket 2024-03-23 06:56:50 +00:00			`class EndlessTokenBucket(TokenBucket):`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`"""Used when download speed is not limited."""`
Start adding types 2022-08-20 09:35:07 +00:00			`def advance(self, size: int):`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`pass`


			`async def download(`
			`client: httpx.AsyncClient,`
			`task_id: int,`
			`source: str,`
			`target: str,`
			`progress: Progress,`
Use an abstract base class for TokenBucket 2024-03-23 06:56:50 +00:00			`token_bucket: TokenBucket,`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`):`
Download chunks to a temp file first 2022-08-14 09:33:23 +00:00			`# Download to a temp file first, then copy to target when over to avoid`
			`# getting saving chunks which may persist if canceled or --keep is used`
			`tmp_target = f"{target}.tmp"`
			`with open(tmp_target, "wb") as f:`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`async with client.stream("GET", source) as response:`
			`size = int(response.headers.get("content-length"))`
			`progress.start(task_id, size)`
			`async for chunk in response.aiter_bytes(chunk_size=CHUNK_SIZE):`
			`f.write(chunk)`
			`size = len(chunk)`
			`token_bucket.advance(size)`
			`progress.advance(task_id, size)`
			`progress.end(task_id)`
Don't rename the file while it's still open issue #111 2022-09-09 06:05:03 +00:00			`os.rename(tmp_target, target)`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00

			`async def download_with_retries(`
			`client: httpx.AsyncClient,`
			`semaphore: asyncio.Semaphore,`
			`task_id: int,`
			`source: str,`
			`target: str,`
			`progress: Progress,`
Use an abstract base class for TokenBucket 2024-03-23 06:56:50 +00:00			`token_bucket: TokenBucket,`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`):`
			`async with semaphore:`
Don't download already downloaded files 2022-08-14 08:02:41 +00:00			`if os.path.exists(target):`
			`size = os.path.getsize(target)`
			`progress.already_downloaded(task_id, size)`
			`return`

Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`for n in range(RETRY_COUNT):`
			`try:`
			`return await download(client, task_id, source, target, progress, token_bucket)`
			`except httpx.RequestError:`
			`logger.exception("Task {task_id} failed. Retrying. Maybe.")`
			`progress.abort(task_id)`
			`if n + 1 >= RETRY_COUNT:`
			`raise`

			`raise Exception("Should not happen")`


			`async def download_all(`
			`sources: List[str],`
			`targets: List[str],`
			`workers: int,`
Removing positional-only arguments for compatibility with Python 3.7 2022-11-19 23:59:05 +00:00			`*,`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`rate_limit: Optional[int] = None`
			`):`
			`progress = Progress(len(sources))`
Use an abstract base class for TokenBucket 2024-03-23 06:56:50 +00:00			`token_bucket = LimitingTokenBucket(rate_limit) if rate_limit else EndlessTokenBucket()`
Add new asyncio downloader code with rate limiting 2022-08-13 09:41:13 +00:00			`async with httpx.AsyncClient(timeout=TIMEOUT) as client:`
			`semaphore = asyncio.Semaphore(workers)`
			`tasks = [download_with_retries(client, semaphore, task_id, source, target, progress, token_bucket)`
			`for task_id, (source, target) in enumerate(zip(sources, targets))]`
			`await asyncio.gather(*tasks)`