diff --git a/lazip.py b/lazip.py index eb8f87b..a96428b 100644 --- a/lazip.py +++ b/lazip.py @@ -14,17 +14,19 @@ # GNU Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public License -# along with palace. If not, see <https://www.gnu.org/licenses/>. +# along with lazip. If not, see <https://www.gnu.org/licenses/>. """Lazy ZIP over HTTP""" -__version__ = '0.0.2' -__all__ = ['Lazip'] +__version__ = '0.0.3' +__all__ = ['Filazy', 'Lazip'] +from abc import abstractmethod from bisect import bisect_left, bisect_right from contextlib import contextmanager +from io import UnsupportedOperation from tempfile import NamedTemporaryFile -from typing import Any, Dict, Iterator, List, Optional, Tuple +from typing import IO, Dict, Iterator, List, Optional, Tuple from zipfile import BadZipFile, ZipFile from requests import Session @@ -41,45 +43,166 @@ def init_range(stop: int, size: int) -> Iterator[Tuple[int, int]]: yield 0, stop-1 -class Lazip: - """File-like object mapped to a ZIP file over HTTP. +class ReadOnlyBinaryIOWrapper(IO[bytes]): + """Wrapper for a read-only binary I/O.""" - This uses HTTP range requests to lazily fetch the file's content, - which is supposed to be fed to ZipFile. 
- """ + file: IO[bytes] + length: int - def __init__(self, session: Session, url: str, - chunk_size: int = CONTENT_CHUNK_SIZE) -> None: - head = session.head(url) - head.raise_for_status() - assert head.status_code == 200 - self.session, self.url, self.chunk_size = session, url, chunk_size - self.length = int(head.headers['Content-Length']) - self.file = NamedTemporaryFile() - self.file.truncate(self.length) - self.left: List[int] = [] - self.right: List[int] = [] - self.check_zip('bytes' in head.headers.get('Accept-Ranges', 'none')) - - def __enter__(self) -> 'Lazip': - self.file.__enter__() - return self - - def __exit__(self, *exc: Any) -> Optional[bool]: - return self.file.__exit__(*exc) + @property + def mode(self) -> str: + """Opening mode, which is always w+b.""" + return self.file.mode @property def name(self) -> str: """File name.""" return self.file.name + def close(self) -> None: + """Close the file.""" + self.file.close() + + @property + def closed(self) -> bool: + """Whether the file is closed.""" + return self.file.closed + + def fileno(self) -> int: + """Return the underlying file descriptor (an integer).""" + return self.file.fileno() + + def flush(self) -> None: + """Flush the underlying file.""" + self.file.flush() + + def isatty(self) -> bool: + """Return whether the underlying file is a tty.""" + return self.file.isatty() + + def read(self, size: int = -1) -> bytes: + """Read up to size bytes from the object and return them. + + As a convenience, if size is unspecified or -1, + all bytes until EOF are returned. Fewer than + size bytes may be returned if EOF is reached. 
+ """ + start = self.tell() + stop = start + size if 0 <= size <= self.length-start else self.length + self.ensure(start, stop-1) + return self.file.read(size) + + def readable(self) -> bool: + """Return whether the underlying file is readable.""" + return self.file.readable() + + def readline(self, limit): + raise UnsupportedOperation + + def readlines(self, hint): + raise UnsupportedOperation + + def seek(self, offset: int, whence: int = 0) -> int: + """Change stream position and return the new absolute position. + + Seek to offset relative to the position indicated by whence: + * 0: Start of stream (the default). offset should be >= 0; + * 1: Current position - offset may be negative; + * 2: End of stream - offset usually negative. + """ + return self.file.seek(offset, whence) + def seekable(self) -> bool: """Return whether random access is supported, which is True.""" - return True + return self.file.seekable() + + def tell(self) -> int: + """Return the current position.""" + return self.file.tell() + + def truncate(self, size: Optional[int] = None) -> int: + """Resize the stream to the given size in bytes. + + If size is unspecified resize to the current position. + The current stream position isn't changed. + + Return the new file size. + """ + return self.file.truncate(size) + + def writable(self) -> bool: + """Return False.""" + return False + + def write(self, s): + raise UnsupportedOperation + + def writelines(self, lines): + raise UnsupportedOperation + + def __next__(self): + raise UnsupportedOperation + + def __iter__(self): + raise UnsupportedOperation + + def __enter__(self) -> 'ReadOnlyBinaryIOWrapper': + self.file.__enter__() + return self + + def __exit__(self, *exc) -> Optional[bool]: + return self.file.__exit__(*exc) + + @abstractmethod + def ensure(self, start: int, end: int) -> None: + """Ensure the data from start to end (inclusive) is available. + + Implementations must restore the original stream position + if they call seek. 
+ """ + + +class Filazy(ReadOnlyBinaryIOWrapper): + """Read-only file-like object mapped to a file over HTTP. + + This uses HTTP range requests to lazily fetch the file's content. + At the end of initialization, __post_init__ will be called. + + Parameters: + session (Session): Requests session + url (str): HTTP URL to the file + chunk_size (int): Download chunk size + + Attributes: + session (Session): Requests session + url (str): HTTP URL to the file + chunk_size (int): Download chunk size + left (List[int]): Left endpoints of downloaded intervals + right (List[int]): Right endpoints of downloaded intervals + accept_ranges (bool): Whether range requests are supported + """ + + def __init__(self, session: Session, url: str, + chunk_size: int = CONTENT_CHUNK_SIZE) -> None: + response = session.head(url) + response.raise_for_status() + assert response.status_code == 200 + headers = response.headers + self.session, self.url, self.chunk_size = session, url, chunk_size + self.length = int(headers['Content-Length']) + self.file = NamedTemporaryFile() + self.truncate(self.length) + self.left: List[int] = [] + self.right: List[int] = [] + self.accept_ranges = 'bytes' in headers.get('Accept-Ranges', 'none') + with self.stay(): self.__post_init__() + + def __post_init__(self) -> None: + pass @contextmanager def stay(self) -> Iterator[None]: - """Return a context manager keeping the position. + """Return a context manager that keeps the stream position. At the end of the block, seek back to original position. 
""" @@ -89,23 +212,6 @@ class Lazip: finally: self.seek(pos) - def check_zip(self, range_request: bool) -> None: - """Check and download until the file is a valid ZIP.""" - if not range_request: - end = self.length - 1 - self.download(0, end) - self.left, self.right = [0], [end] - return - for start, end in init_range(self.length, self.chunk_size): - self.download(start, end) - with self.stay(): - try: - ZipFile(self) # type: ignore - except BadZipFile: - pass - else: - break - def stream_response(self, start: int, end: int, base_headers: Dict[str, str] = {}) -> Response: """Return HTTP response to a range request from start to end.""" @@ -116,7 +222,7 @@ class Lazip: left: int, right: int) -> Iterator[Tuple[int, int]]: """Return an iterator of intervals to be fetched. - Args: + Parameters: start (int): Start of needed interval end (int): End of needed interval left (int): Index of first overlapping downloaded data @@ -131,7 +237,7 @@ class Lazip: if i <= end: yield i, end self.left[left:right], self.right[left:right] = [start], [end] - def download(self, start: int, end: int) -> None: + def ensure(self, start: int, end: int) -> None: """Download bytes from start to end inclusively.""" with self.stay(): i, j = bisect_left(self.right, start), bisect_right(self.left, end) @@ -143,32 +249,26 @@ class Lazip: decode_content=False): self.file.write(chunk) - def read(self, size: int = -1) -> bytes: - """Read up to size bytes from the object and return them. - As a convenience, if size is unspecified or -1, - all bytes until EOF are returned. Fewer than - size bytes may be returned if EOF is reached. - """ - start = self.tell() - stop = start + size if 0 <= size <= self.length-start else self.length - self.download(start, stop-1) - return self.file.read(size) +class Lazip(Filazy): + """Read-only file-like object mapped to a ZIP file over HTTP. - def seek(self, offset: int, whence: int = 0) -> int: - """Change stream position and return the new absolute position. 
+ This uses HTTP range requests to lazily fetch the file's content, + which is supposed to be fed to ZipFile. + """ - Seek to offset relative position indicated by whence: - * 0: Start of stream (the default). pos should be >= 0; - * 1: Current position - pos may be negative; - * 2: End of stream - pos usually negative. - """ - return self.file.seek(offset, whence) - - def tell(self) -> int: - """Return the current possition.""" - return self.file.tell() - - def close(self) -> None: - """Close the file.""" - self.file.close() + def __post_init__(self) -> None: + """Check and download until the file is a valid ZIP.""" + if not self.accept_ranges: + end = self.length - 1 + self.ensure(0, end) + self.left, self.right = [0], [end] + return + for start, end in init_range(self.length, self.chunk_size): + self.ensure(start, end) + try: + ZipFile(self) + except BadZipFile: + pass + else: + break