model_signing.hashing
High level API for the hashing interface of the `model_signing` library.
Hashing is used both for signing and verification, and users should ensure that the same configuration is used in both cases.
The module can also be used on its own, to hash a single model without signing it:

```python
model_signing.hashing.hash(model_path)
```
This module allows setting up the hashing configuration in a single variable and then sharing it between signing and verification:

```python
hashing_config = model_signing.hashing.Config().set_ignored_paths(
    paths=["README.md"], ignore_git_paths=True
)

signing_config = (
    model_signing.signing.Config()
    .use_elliptic_key_signer(private_key="key")
    .set_hashing_config(hashing_config)
)

verifying_config = (
    model_signing.verifying.Config()
    .use_elliptic_key_verifier(public_key="key.pub")
    .set_hashing_config(hashing_config)
)
```
The API defined here is stable and backwards compatible.
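For example (a sketch only; `"path/to/model"` is a placeholder), the same model can be hashed at the default file granularity or at shard granularity:

```python
import model_signing

# Default configuration: file-level granularity, SHA256.
manifest_by_file = model_signing.hashing.hash("path/to/model")

# Shard-level granularity: faster for large files, larger signature payload.
config = model_signing.hashing.Config().use_shard_serialization(
    shard_size=1_000_000_000
)
manifest_by_shard = config.hash("path/to/model")
```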
````python
# Copyright 2024 The Sigstore Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""High level API for the hashing interface of `model_signing` library.

Hashing is used both for signing and verification and users should ensure that
the same configuration is used in both cases.

The module could also be used to just hash a single model, without signing it:

```python
model_signing.hashing.hash(model_path)
```

This module allows setting up the hashing configuration to a single variable and
then sharing it between signing and verification.

```python
hashing_config = model_signing.hashing.Config().set_ignored_paths(
    paths=["README.md"], ignore_git_paths=True
)

signing_config = (
    model_signing.signing.Config()
    .use_elliptic_key_signer(private_key="key")
    .set_hashing_config(hashing_config)
)

verifying_config = (
    model_signing.verifying.Config()
    .use_elliptic_key_verifier(public_key="key.pub")
    .set_hashing_config(hashing_config)
)
```

The API defined here is stable and backwards compatible.
"""

from collections.abc import Callable, Iterable
import os
import pathlib
import sys
from typing import Literal, Optional, Union

from model_signing import manifest
from model_signing._hashing import hashing
from model_signing._hashing import io
from model_signing._hashing import memory
from model_signing._serialization import file
from model_signing._serialization import file_shard


if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self


# `TypeAlias` only exists from Python 3.10
# `TypeAlias` is deprecated in Python 3.12 in favor of `type`
if sys.version_info >= (3, 10):
    from typing import TypeAlias
else:
    from typing_extensions import TypeAlias


# Type alias to support `os.PathLike`, `str` and `bytes` objects in the API
# When Python 3.12 is the minimum supported version we can use `type`
# When Python 3.11 is the minimum supported version we can use `|`
PathLike: TypeAlias = Union[str, bytes, os.PathLike]


def hash(model_path: PathLike) -> manifest.Manifest:
    """Hashes a model using the default configuration.

    Hashing is the shared part between signing and verification and is also
    expected to be the slowest component. When serializing a model, we need to
    spend time proportional to the model size on disk.

    This method returns a "manifest" of the model. A manifest is a collection of
    every object in the model, paired with the corresponding hash. Currently, we
    consider an object in the model to be either a file or a shard of the file.
    Large models with large files will be hashed much faster when every shard is
    hashed in parallel, at the cost of generating a larger payload for the
    signature. In future releases we could support hashing individual tensors or
    tensor slices for further speed optimizations for very large models.

    Args:
        model_path: The path to the model to hash.

    Returns:
        A manifest of the hashed model.
    """
    return Config().hash(model_path)


class Config:
    """Configuration to use when hashing models.

    Hashing is the shared part between signing and verification and is also
    expected to be the slowest component. When serializing a model, we need to
    spend time proportional to the model size on disk.

    Hashing builds a "manifest" of the model. A manifest is a collection of
    every object in the model, paired with the corresponding hash. Currently, we
    consider an object in the model to be either a file or a shard of the file.
    Large models with large files will be hashed much faster when every shard is
    hashed in parallel, at the cost of generating a larger payload for the
    signature. In future releases we could support hashing individual tensors or
    tensor slices for further speed optimizations for very large models.

    This configuration class supports configuring the hashing granularity. By
    default, we hash at file level granularity.

    This configuration class also supports configuring the hash method used to
    generate the hash for every object in the model. We currently support SHA256
    and BLAKE2, with SHA256 being the default.

    This configuration class also supports configuring which paths from the
    model directory should be ignored. These are files that doesn't impact the
    behavior of the model, or files that won't be distributed with the model. By
    default, only files that are associated with a git repository (`.git`,
    `.gitattributes`, `.gitignore`, etc.) are ignored.
    """

    def __init__(self):
        """Initializes the default configuration for hashing."""
        self._ignored_paths = frozenset()
        self._ignore_git_paths = True
        self.use_file_serialization()
        self._allow_symlinks = False

    def hash(self, model_path: PathLike) -> manifest.Manifest:
        """Hashes a model using the current configuration."""
        # All paths in ignored_paths must have model_path as prefix
        ignored_paths = []
        for p in self._ignored_paths:
            rp = os.path.relpath(p, model_path)
            # rp may start with "../" if it is not relative to model_path
            if not rp.startswith("../"):
                ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))

        if self._ignore_git_paths:
            ignored_paths.extend(
                [
                    os.path.join(model_path, p)
                    for p in [
                        ".git/",
                        ".gitattributes",
                        ".github/",
                        ".gitignore",
                    ]
                ]
            )

        self._serializer.set_allow_symlinks(self._allow_symlinks)

        return self._serializer.serialize(
            pathlib.Path(model_path), ignore_paths=ignored_paths
        )

    def _build_stream_hasher(
        self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
    ) -> hashing.StreamingHashEngine:
        """Builds a streaming hasher from a constant string.

        Args:
            hashing_algorithm: The hashing algorithm to use.

        Returns:
            An instance of the requested hasher.
        """
        # TODO: Once Python 3.9 support is deprecated revert to using `match`
        if hashing_algorithm == "sha256":
            return memory.SHA256()
        if hashing_algorithm == "blake2":
            return memory.BLAKE2()

        raise ValueError(f"Unsupported hashing method {hashing_algorithm}")

    def _build_file_hasher_factory(
        self,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
    ) -> Callable[[pathlib.Path], io.SimpleFileHasher]:
        """Builds the hasher factory for a serialization by file.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.

        Returns:
            The hasher factory that should be used by the active serialization
            method.
        """

        def _factory(path: pathlib.Path) -> io.SimpleFileHasher:
            hasher = self._build_stream_hasher(hashing_algorithm)
            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)

        return _factory

    def _build_sharded_file_hasher_factory(
        self,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        shard_size: int = 1_000_000_000,
    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
        """Builds the hasher factory for a serialization by file shards.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a shard.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            shard_size: The size of a file shard. Default is 1 GB.

        Returns:
            The hasher factory that should be used by the active serialization
            method.
        """

        def _factory(
            path: pathlib.Path, start: int, end: int
        ) -> io.ShardedFileHasher:
            hasher = self._build_stream_hasher(hashing_algorithm)
            return io.ShardedFileHasher(
                path,
                hasher,
                start=start,
                end=end,
                chunk_size=chunk_size,
                shard_size=shard_size,
            )

        return _factory

    def use_file_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (file, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is paired with its digest and a manifest
        containing all these pairings is being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            max_workers: Maximum number of workers to use in parallel. Default
                is to defer to the `concurrent.futures` library to select the best
                value for the current machine.
            allow_symlinks: Controls whether symbolic links are included. If a
                symlink is present but the flag is `False` (default) the
                serialization would raise an error.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file.Serializer(
            self._build_file_hasher_factory(hashing_algorithm, chunk_size),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

    def use_shard_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        shard_size: int = 1_000_000_000,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (shard, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is sharded in equal sized shards, every shard is
        paired with its digest and a manifest containing all these pairings is
        being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a shard.
            chunk_size: The amount of file to read at once. Default is 1MB. A
                special value of 0 signals to attempt to read everything in a
                single call.
            shard_size: The size of a file shard. Default is 1 GB.
            max_workers: Maximum number of workers to use in parallel. Default
                is to defer to the `concurrent.futures` library to select the best
                value for the current machine.
            allow_symlinks: Controls whether symbolic links are included. If a
                symlink is present but the flag is `False` (default) the
                serialization would raise an error.
            ignore_paths: Paths of files to ignore.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file_shard.Serializer(
            self._build_sharded_file_hasher_factory(
                hashing_algorithm, chunk_size, shard_size
            ),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

    def set_ignored_paths(
        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
    ) -> Self:
        """Configures the paths to be ignored during serialization of a model.

        If the model is a single file, there are no paths that are ignored. If
        the model is a directory, all paths are considered as relative to the
        model directory, since we never look at files outside of it.

        If an ignored path is a directory, serialization will ignore both the
        path and any of its children.

        Args:
            paths: The paths to ignore.
            ignore_git_paths: Whether to ignore git related paths (default) or
                include them in the signature.

        Returns:
            The new hashing configuration with a new set of ignored paths.
        """
        # Use relpath to possibly fix weird paths like '../a/b' -> 'b'
        # when '../a/' is a no-op
        self._ignored_paths = frozenset(
            {pathlib.Path(p).resolve() for p in paths}
        )
        self._ignore_git_paths = ignore_git_paths
        return self

    def add_ignored_paths(
        self, *, model_path: PathLike, paths: Iterable[PathLike]
    ) -> None:
        """Add more paths to ignore to existing set of paths.

        Args:
            model_path: The path to the model
            paths: Additional paths to ignore. All path must be relative to
                the model directory.
        """
        newset = set(self._ignored_paths)
        newset.update([os.path.join(model_path, p) for p in paths])
        self._ignored_paths = newset

    def set_allow_symlinks(self, allow_symlinks: bool) -> Self:
        """Set whether following symlinks is allowed."""
        self._allow_symlinks = allow_symlinks
        return self
````
```python
def hash(model_path: PathLike) -> manifest.Manifest:
```
Hashes a model using the default configuration.
Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.
This method returns a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.
Arguments:
- model_path: The path to the model to hash.
Returns:
A manifest of the hashed model.
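A minimal usage sketch (the model path is a placeholder):

```python
import model_signing

# Hashing with the default configuration; equivalent to Config().hash(path).
model_manifest = model_signing.hashing.hash("path/to/model")
```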
```python
class Config:
```
Configuration to use when hashing models.
Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.
Hashing builds a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.
This configuration class supports configuring the hashing granularity. By default, we hash at file level granularity.
This configuration class also supports configuring the hash method used to generate the hash for every object in the model. We currently support SHA256 and BLAKE2, with SHA256 being the default.
This configuration class also supports configuring which paths from the model directory should be ignored. These are files that don't impact the behavior of the model, or files that won't be distributed with the model. By default, only files that are associated with a git repository (`.git`, `.gitattributes`, `.gitignore`, etc.) are ignored.
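For instance (a sketch; the ignored file name and model path are placeholders), a configuration that hashes with BLAKE2 at the default file granularity and skips a local README:

```python
import model_signing

config = (
    model_signing.hashing.Config()
    .use_file_serialization(hashing_algorithm="blake2")
    .set_ignored_paths(paths=["README.md"], ignore_git_paths=True)
)
model_manifest = config.hash("path/to/model")
```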
```python
def __init__(self):
```
Initializes the default configuration for hashing.
```python
def hash(self, model_path: PathLike) -> manifest.Manifest:
```
Hashes a model using the current configuration.
```python
def use_file_serialization(
    self,
    *,
    hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
    chunk_size: int = 1048576,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
```
Configures serialization to build a manifest of (file, hash) pairs.
The serialization method in this configuration is changed to one where every file in the model is paired with its digest, and a manifest containing all these pairings is built.
Arguments:
- hashing_algorithm: The hashing algorithm to use to hash a file.
- chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
- max_workers: Maximum number of workers to use in parallel. Default is to defer to the `concurrent.futures` library to select the best value for the current machine.
- allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is `False` (default), the serialization raises an error.
- ignore_paths: Paths of files to ignore.
Returns:
The new hashing configuration with the new serialization method.
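For example (a sketch; the chunk size and worker count are arbitrary choices, not recommendations), file-level serialization with a larger read chunk and a bounded worker pool:

```python
import model_signing

config = model_signing.hashing.Config().use_file_serialization(
    hashing_algorithm="sha256",
    chunk_size=8 * 1024 * 1024,  # read files 8 MB at a time
    max_workers=4,
)
model_manifest = config.hash("path/to/model")
```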
```python
def use_shard_serialization(
    self,
    *,
    hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
    chunk_size: int = 1048576,
    shard_size: int = 1_000_000_000,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
```
Configures serialization to build a manifest of (shard, hash) pairs.
The serialization method in this configuration is changed to one where every file in the model is split into equal-sized shards, every shard is paired with its digest, and a manifest containing all these pairings is built.
Arguments:
- hashing_algorithm: The hashing algorithm to use to hash a shard.
- chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
- shard_size: The size of a file shard. Default is 1 GB.
- max_workers: Maximum number of workers to use in parallel. Default is to defer to the `concurrent.futures` library to select the best value for the current machine.
- allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is `False` (default), the serialization raises an error.
- ignore_paths: Paths of files to ignore.
Returns:
The new hashing configuration with the new serialization method.
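For example (a sketch; the shard size and worker count are arbitrary choices), shard-level serialization with 2 GB shards that also follows symbolic links:

```python
import model_signing

config = model_signing.hashing.Config().use_shard_serialization(
    shard_size=2_000_000_000,  # hash files in 2 GB shards
    max_workers=8,
    allow_symlinks=True,  # follow symlinks instead of raising an error
)
model_manifest = config.hash("path/to/model")
```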
```python
def set_ignored_paths(
    self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
) -> Self:
```
Configures the paths to be ignored during serialization of a model.
If the model is a single file, no paths are ignored. If the model is a directory, all paths are treated as relative to the model directory, since we never look at files outside of it.
If an ignored path is a directory, serialization will ignore both the path and any of its children.
Arguments:
- paths: The paths to ignore.
- ignore_git_paths: Whether to ignore git related paths (default) or include them in the signature.
Returns:
The new hashing configuration with a new set of ignored paths.
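For example (a sketch; the file and directory names are placeholders), ignoring a documentation file and a local directory while still ignoring git paths:

```python
import model_signing

config = model_signing.hashing.Config().set_ignored_paths(
    paths=["README.md", "docs/"],  # ignoring a directory also ignores its children
    ignore_git_paths=True,
)
model_manifest = config.hash("path/to/model")
```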
```python
def add_ignored_paths(
    self, *, model_path: PathLike, paths: Iterable[PathLike]
) -> None:
```
Adds more paths to ignore to the existing set of ignored paths.
Arguments:
- model_path: The path to the model.
- paths: Additional paths to ignore. All paths must be relative to the model directory.
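A short sketch (paths are placeholders); note that, unlike the other setters, this method does not return the configuration, so it cannot be chained:

```python
import model_signing

config = model_signing.hashing.Config().set_ignored_paths(paths=["README.md"])
# Returns None: call it as a statement rather than in a chain.
config.add_ignored_paths(model_path="path/to/model", paths=["docs/", "LICENSE"])
model_manifest = config.hash("path/to/model")
```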