Add parallel download support to BatchDownloader

Neil Botelho 2023-11-04 23:06:00 +05:30
parent a39cbc584d
commit 8b41c672c4
1 changed file with 23 additions and 5 deletions


@@ -4,6 +4,8 @@ import email.message
 import logging
 import mimetypes
 import os
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
 from typing import Iterable, Optional, Tuple
 
 from pip._vendor.requests.models import CONTENT_CHUNK_SIZE, Response
@@ -166,14 +168,30 @@ class BatchDownloader:
     def _sequential_download(
         self, link: Link, location: str, progress_bar: str
     ) -> Tuple[Link, Tuple[str, str]]:
-        filepath, content_type = _download(
-            link, location, self._session, self._progress_bar
-        )
+        filepath, content_type = _download(link, location, self._session, progress_bar)
         return link, (filepath, content_type)
 
+    def _download_parallel(
+        self, links: Iterable[Link], location: str, max_workers: int
+    ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
+        with ThreadPoolExecutor(max_workers=max_workers) as pool:
+            _download_parallel = partial(
+                self._sequential_download, location=location, progress_bar="off"
+            )
+            results = list(pool.map(_download_parallel, links))
+        return results
+
     def __call__(
         self, links: Iterable[Link], location: str
     ) -> Iterable[Tuple[Link, Tuple[str, str]]]:
         """Download the files given by links into location."""
-        for link in links:
-            yield self._sequential_download(link, location, self._progress_bar)
+        links = list(links)
+        max_workers = self._session.parallel_downloads
+        if max_workers == 1 or len(links) == 1:
+            # TODO: set minimum number of links to perform parallel download
+            for link in links:
+                yield self._sequential_download(link, location, self._progress_bar)
+        else:
+            results = self._download_parallel(links, location, max_workers)
+            for result in results:
+                yield result
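
For context, the new _download_parallel method combines the standard-library ThreadPoolExecutor with functools.partial: the fixed arguments (location, progress bar setting) are bound up front, and pool.map fans the per-link download out across a bounded pool of worker threads while keeping results in input order. Below is a minimal, self-contained sketch of that same pattern; the fetch helper, the example URLs, and the worker count are illustrative stand-ins, not pip's actual _download/session machinery.

from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import List, Tuple
from urllib.request import urlopen


def fetch(url: str, timeout: float) -> Tuple[str, int]:
    """Hypothetical per-item worker: download one URL and report its size."""
    with urlopen(url, timeout=timeout) as resp:
        return url, len(resp.read())


def fetch_all(urls: List[str], max_workers: int = 4) -> List[Tuple[str, int]]:
    # Bind the fixed arguments up front, as _download_parallel does with
    # location= and progress_bar=, so pool.map only varies the link/URL.
    worker = partial(fetch, timeout=10.0)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # pool.map preserves input order; list() drains it before the
        # executor shuts down its worker threads.
        return list(pool.map(worker, urls))


if __name__ == "__main__":
    for url, size in fetch_all(["https://example.org", "https://example.com"]):
        print(url, size)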