Edit on GitHub

model_signing.hashing

High level API for the hashing interface of model_signing library.

Hashing is used both for signing and verification and users should ensure that the same configuration is used in both cases.

The module could also be used to just hash a single model, without signing it: `model_signing.hashing.hash(model_path)`.

This module allows setting up the hashing configuration to a single variable and then sharing it between signing and verification.

hashing_config = model_signing.hashing.Config().set_ignored_paths(
    paths=["README.md"], ignore_git_paths=True
)

signing_config = (
    model_signing.signing.Config()
    .use_elliptic_key_signer(private_key="key")
    .set_hashing_config(hashing_config)
)

verifying_config = (
    model_signing.verifying.Config()
    .use_elliptic_key_verifier(public_key="key.pub")
    .set_hashing_config(hashing_config)
)

The API defined here is stable and backwards compatible.

  1# Copyright 2024 The Sigstore Authors
  2#
  3# Licensed under the Apache License, Version 2.0 (the "License");
  4# you may not use this file except in compliance with the License.
  5# You may obtain a copy of the License at
  6#
  7#      http://www.apache.org/licenses/LICENSE-2.0
  8#
  9# Unless required by applicable law or agreed to in writing, software
 10# distributed under the License is distributed on an "AS IS" BASIS,
 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12# See the License for the specific language governing permissions and
 13# limitations under the License.
 14
 15"""High level API for the hashing interface of `model_signing` library.
 16
 17Hashing is used both for signing and verification and users should ensure that
 18the same configuration is used in both cases.
 19
 20The module could also be used to just hash a single model, without signing it:
 21
 22```python
 23model_signing.hashing.hash(model_path)
 24```
 25
 26This module allows setting up the hashing configuration to a single variable and
 27then sharing it between signing and verification.
 28
 29```python
 30hashing_config = model_signing.hashing.Config().set_ignored_paths(
 31    paths=["README.md"], ignore_git_paths=True
 32)
 33
 34signing_config = (
 35    model_signing.signing.Config()
 36    .use_elliptic_key_signer(private_key="key")
 37    .set_hashing_config(hashing_config)
 38)
 39
 40verifying_config = (
 41    model_signing.verifying.Config()
 42    .use_elliptic_key_verifier(public_key="key.pub")
 43    .set_hashing_config(hashing_config)
 44)
 45```
 46
 47The API defined here is stable and backwards compatible.
 48"""
 49
 50from collections.abc import Callable, Iterable
 51import os
 52import pathlib
 53import sys
 54from typing import Literal
 55
 56import blake3
 57
 58from model_signing import manifest
 59from model_signing._hashing import hashing
 60from model_signing._hashing import io
 61from model_signing._hashing import memory
 62from model_signing._serialization import file
 63from model_signing._serialization import file_shard
 64
 65
 66if sys.version_info >= (3, 11):
 67    from typing import Self
 68else:
 69    from typing_extensions import Self
 70
 71
 72# `TypeAlias` only exists from Python 3.10
 73# `TypeAlias` is deprecated in Python 3.12 in favor of `type`
 74from typing import TypeAlias
 75
 76
 77# Type alias to support `os.PathLike`, `str` and `bytes` objects in the API
 78# When Python 3.12 is the minimum supported version we can use `type`
 79# When Python 3.11 is the minimum supported version we can use `|`
 80PathLike: TypeAlias = str | bytes | os.PathLike
 81
 82
 83def hash(model_path: PathLike) -> manifest.Manifest:
 84    """Hashes a model using the default configuration.
 85
 86    Hashing is the shared part between signing and verification and is also
 87    expected to be the slowest component. When serializing a model, we need to
 88    spend time proportional to the model size on disk.
 89
 90    This method returns a "manifest" of the model. A manifest is a collection of
 91    every object in the model, paired with the corresponding hash. Currently, we
 92    consider an object in the model to be either a file or a shard of the file.
 93    Large models with large files will be hashed much faster when every shard is
 94    hashed in parallel, at the cost of generating a larger payload for the
 95    signature. In future releases we could support hashing individual tensors or
 96    tensor slices for further speed optimizations for very large models.
 97
 98    Args:
 99        model_path: The path to the model to hash.
100
101    Returns:
102        A manifest of the hashed model.
103    """
104    return Config().hash(model_path)
105
106
107class Config:
108    """Configuration to use when hashing models.
109
110    Hashing is the shared part between signing and verification and is also
111    expected to be the slowest component. When serializing a model, we need to
112    spend time proportional to the model size on disk.
113
114    Hashing builds a "manifest" of the model. A manifest is a collection of
115    every object in the model, paired with the corresponding hash. Currently, we
116    consider an object in the model to be either a file or a shard of the file.
117    Large models with large files will be hashed much faster when every shard is
118    hashed in parallel, at the cost of generating a larger payload for the
119    signature. In future releases we could support hashing individual tensors or
120    tensor slices for further speed optimizations for very large models.
121
122    This configuration class supports configuring the hashing granularity. By
123    default, we hash at file level granularity.
124
125    This configuration class also supports configuring the hash method used to
126    generate the hash for every object in the model. We currently support
127    SHA256, BLAKE2, and BLAKE3, with SHA256 being the default.
128
129    This configuration class also supports configuring which paths from the
130    model directory should be ignored. These are files that doesn't impact the
131    behavior of the model, or files that won't be distributed with the model. By
132    default, only files that are associated with a git repository (`.git`,
133    `.gitattributes`, `.gitignore`, etc.) are ignored.
134    """
135
136    def __init__(self):
137        """Initializes the default configuration for hashing."""
138        self._ignored_paths = frozenset()
139        self._ignore_git_paths = True
140        self.use_file_serialization()
141        self._allow_symlinks = False
142
143    def hash(
144        self,
145        model_path: PathLike,
146        *,
147        files_to_hash: Iterable[PathLike] | None = None,
148    ) -> manifest.Manifest:
149        """Hashes a model using the current configuration."""
150        # All paths in ``_ignored_paths`` are expected to be relative to the
151        # model directory. Join them to ``model_path`` and ensure they do not
152        # escape it.
153        model_path = pathlib.Path(model_path)
154        ignored_paths = []
155        for p in self._ignored_paths:
156            full = model_path / p
157            try:
158                full.relative_to(model_path)
159            except ValueError:
160                continue
161            ignored_paths.append(full)
162
163        if self._ignore_git_paths:
164            ignored_paths.extend(
165                [
166                    model_path / p
167                    for p in [
168                        ".git/",
169                        ".gitattributes",
170                        ".github/",
171                        ".gitignore",
172                    ]
173                ]
174            )
175
176        self._serializer.set_allow_symlinks(self._allow_symlinks)
177
178        return self._serializer.serialize(
179            pathlib.Path(model_path),
180            ignore_paths=ignored_paths,
181            files_to_hash=files_to_hash,
182        )
183
184    def _build_stream_hasher(
185        self,
186        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
187    ) -> hashing.StreamingHashEngine:
188        """Builds a streaming hasher from a constant string.
189
190        Args:
191            hashing_algorithm: The hashing algorithm to use.
192
193        Returns:
194            An instance of the requested hasher.
195        """
196        match hashing_algorithm:
197            case "sha256":
198                return memory.SHA256()
199            case "blake2":
200                return memory.BLAKE2()
201            case "blake3":
202                return memory.BLAKE3()
203            case _:
204                raise ValueError(
205                    f"Unsupported hashing method {hashing_algorithm}"
206                )
207
208    def _build_file_hasher_factory(
209        self,
210        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
211        chunk_size: int = 1048576,
212        max_workers: int | None = None,
213    ) -> Callable[[pathlib.Path], io.FileHasher]:
214        """Builds the hasher factory for a serialization by file.
215
216        Args:
217            hashing_algorithm: The hashing algorithm to use to hash a file.
218            chunk_size: The amount of file to read at once. Default is 1MB. A
219              special value of 0 signals to attempt to read everything in a
220              single call. This is ignored for BLAKE3.
221            max_workers: Maximum number of workers to use in parallel. Defaults
222              to the number of logical cores. Only relevant for BLAKE3.
223
224        Returns:
225            The hasher factory that should be used by the active serialization
226            method.
227        """
228        if max_workers is None:
229            max_workers = blake3.blake3.AUTO
230
231        def _factory(path: pathlib.Path) -> io.FileHasher:
232            if hashing_algorithm == "blake3":
233                return io.Blake3FileHasher(path, max_threads=max_workers)
234            hasher = self._build_stream_hasher(hashing_algorithm)
235            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)
236
237        return _factory
238
239    def _build_sharded_file_hasher_factory(
240        self,
241        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
242        chunk_size: int = 1048576,
243        shard_size: int = 1_000_000_000,
244    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
245        """Builds the hasher factory for a serialization by file shards.
246
247        This is not recommended for BLAKE3 because it is not necessary. BLAKE3
248        already operates in parallel.
249
250        Args:
251            hashing_algorithm: The hashing algorithm to use to hash a shard.
252            chunk_size: The amount of file to read at once. Default is 1MB. A
253              special value of 0 signals to attempt to read everything in a
254              single call.
255            shard_size: The size of a file shard. Default is 1 GB.
256
257        Returns:
258            The hasher factory that should be used by the active serialization
259            method.
260        """
261
262        def _factory(
263            path: pathlib.Path, start: int, end: int
264        ) -> io.ShardedFileHasher:
265            hasher = self._build_stream_hasher(hashing_algorithm)
266            return io.ShardedFileHasher(
267                path,
268                hasher,
269                start=start,
270                end=end,
271                chunk_size=chunk_size,
272                shard_size=shard_size,
273            )
274
275        return _factory
276
277    def use_file_serialization(
278        self,
279        *,
280        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
281        chunk_size: int = 1048576,
282        max_workers: int | None = None,
283        allow_symlinks: bool = False,
284        ignore_paths: Iterable[pathlib.Path] = frozenset(),
285    ) -> Self:
286        """Configures serialization to build a manifest of (file, hash) pairs.
287
288        The serialization method in this configuration is changed to one where
289        every file in the model is paired with its digest and a manifest
290        containing all these pairings is being built.
291
292        Args:
293            hashing_algorithm: The hashing algorithm to use to hash a file.
294            chunk_size: The amount of file to read at once. Default is 1MB. A
295              special value of 0 signals to attempt to read everything in a
296              single call. Ignored for BLAKE3.
297            max_workers: Maximum number of workers to use in parallel. Default
298              is to defer to the `concurrent.futures` library to select the best
299              value for the current machine, or the number of logical cores
300              when doing BLAKE3 hashing. When reading files off of slower
301              hardware like an HDD rather than an SSD, and using BLAKE3,
302              setting max_workers to 1 may improve performance.
303            allow_symlinks: Controls whether symbolic links are included. If a
304              symlink is present but the flag is `False` (default) the
305              serialization would raise an error.
306
307        Returns:
308            The new hashing configuration with the new serialization method.
309        """
310        self._serializer = file.Serializer(
311            self._build_file_hasher_factory(
312                hashing_algorithm, chunk_size, max_workers
313            ),
314            max_workers=max_workers,
315            allow_symlinks=allow_symlinks,
316            ignore_paths=ignore_paths,
317        )
318        return self
319
320    def use_shard_serialization(
321        self,
322        *,
323        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
324        chunk_size: int = 1048576,
325        shard_size: int = 1_000_000_000,
326        max_workers: int | None = None,
327        allow_symlinks: bool = False,
328        ignore_paths: Iterable[pathlib.Path] = frozenset(),
329    ) -> Self:
330        """Configures serialization to build a manifest of (shard, hash) pairs.
331
332        For BLAKE3 this is equivalent to not sharding. Sharding is bypassed
333        because BLAKE3 already operates in parallel. This means the chunk_size
334        and shard_size args are ignored.
335
336        The serialization method in this configuration is changed to one where
337        every file in the model is sharded in equal sized shards, every shard is
338        paired with its digest and a manifest containing all these pairings is
339        being built.
340
341        Args:
342            hashing_algorithm: The hashing algorithm to use to hash a shard.
343            chunk_size: The amount of file to read at once. Default is 1MB. A
344              special value of 0 signals to attempt to read everything in a
345              single call.
346            shard_size: The size of a file shard. Default is 1 GB.
347            max_workers: Maximum number of workers to use in parallel. Default
348              is to defer to the `concurrent.futures` library to select the best
349              value for the current machine.
350            allow_symlinks: Controls whether symbolic links are included. If a
351              symlink is present but the flag is `False` (default) the
352              serialization would raise an error.
353            ignore_paths: Paths of files to ignore.
354
355        Returns:
356            The new hashing configuration with the new serialization method.
357        """
358        if hashing_algorithm == "blake3":
359            return self.use_file_serialization(
360                hashing_algorithm=hashing_algorithm,
361                chunk_size=chunk_size,
362                max_workers=max_workers,
363                allow_symlinks=allow_symlinks,
364                ignore_paths=ignore_paths,
365            )
366
367        self._serializer = file_shard.Serializer(
368            self._build_sharded_file_hasher_factory(
369                hashing_algorithm, chunk_size, shard_size
370            ),
371            max_workers=max_workers,
372            allow_symlinks=allow_symlinks,
373            ignore_paths=ignore_paths,
374        )
375        return self
376
377    def set_ignored_paths(
378        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
379    ) -> Self:
380        """Configures the paths to be ignored during serialization of a model.
381
382        If the model is a single file, there are no paths that are ignored. If
383        the model is a directory, all paths are considered as relative to the
384        model directory, since we never look at files outside of it.
385
386        If an ignored path is a directory, serialization will ignore both the
387        path and any of its children.
388
389        Args:
390            paths: The paths to ignore.
391            ignore_git_paths: Whether to ignore git related paths (default) or
392              include them in the signature.
393
394        Returns:
395            The new hashing configuration with a new set of ignored paths.
396        """
397        # Preserve the user-provided relative paths; they are resolved against
398        # the model directory later when hashing.
399        self._ignored_paths = frozenset(pathlib.Path(p) for p in paths)
400        self._ignore_git_paths = ignore_git_paths
401        return self
402
403    def add_ignored_paths(
404        self, *, model_path: PathLike, paths: Iterable[PathLike]
405    ) -> None:
406        """Add more paths to ignore to existing set of paths.
407
408        Args:
409            model_path: The path to the model
410            paths: Additional paths to ignore. All path must be relative to
411                   the model directory.
412        """
413        newset = set(self._ignored_paths)
414        model_path = pathlib.Path(model_path)
415        for p in paths:
416            candidate = pathlib.Path(p)
417            full = model_path / candidate
418            try:
419                full.relative_to(model_path)
420            except ValueError:
421                continue
422            newset.add(candidate)
423        self._ignored_paths = newset
424
425    def set_allow_symlinks(self, allow_symlinks: bool) -> Self:
426        """Set whether following symlinks is allowed."""
427        self._allow_symlinks = allow_symlinks
428        return self
# NOTE(review): this region duplicates definitions that appear earlier in the
# file (extraction artifact); consider deduplicating. A stray, bodiless
# `def hash(...)` signature fragment that broke the syntax here was removed.
PathLike: TypeAlias = str | bytes | os.PathLike


def hash(model_path: PathLike) -> manifest.Manifest:
    """Hashes a model using the default configuration.

    Hashing is the shared part between signing and verification and is also
    expected to be the slowest component. When serializing a model, we need to
    spend time proportional to the model size on disk.

    This method returns a "manifest" of the model. A manifest is a collection of
    every object in the model, paired with the corresponding hash. Currently, we
    consider an object in the model to be either a file or a shard of the file.
    Large models with large files will be hashed much faster when every shard is
    hashed in parallel, at the cost of generating a larger payload for the
    signature. In future releases we could support hashing individual tensors or
    tensor slices for further speed optimizations for very large models.

    Args:
        model_path: The path to the model to hash.

    Returns:
        A manifest of the hashed model.
    """
    return Config().hash(model_path)

Hashes a model using the default configuration.

Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.

This method returns a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.

Arguments:
  • model_path: The path to the model to hash.

Returns:
  A manifest of the hashed model.

class Config:
108class Config:
109    """Configuration to use when hashing models.
110
111    Hashing is the shared part between signing and verification and is also
112    expected to be the slowest component. When serializing a model, we need to
113    spend time proportional to the model size on disk.
114
115    Hashing builds a "manifest" of the model. A manifest is a collection of
116    every object in the model, paired with the corresponding hash. Currently, we
117    consider an object in the model to be either a file or a shard of the file.
118    Large models with large files will be hashed much faster when every shard is
119    hashed in parallel, at the cost of generating a larger payload for the
120    signature. In future releases we could support hashing individual tensors or
121    tensor slices for further speed optimizations for very large models.
122
123    This configuration class supports configuring the hashing granularity. By
124    default, we hash at file level granularity.
125
126    This configuration class also supports configuring the hash method used to
127    generate the hash for every object in the model. We currently support
128    SHA256, BLAKE2, and BLAKE3, with SHA256 being the default.
129
130    This configuration class also supports configuring which paths from the
 131    model directory should be ignored. These are files that don't impact
132    behavior of the model, or files that won't be distributed with the model. By
133    default, only files that are associated with a git repository (`.git`,
134    `.gitattributes`, `.gitignore`, etc.) are ignored.
135    """
136
137    def __init__(self):
138        """Initializes the default configuration for hashing."""
139        self._ignored_paths = frozenset()
140        self._ignore_git_paths = True
141        self.use_file_serialization()
142        self._allow_symlinks = False
143
144    def hash(
145        self,
146        model_path: PathLike,
147        *,
148        files_to_hash: Iterable[PathLike] | None = None,
149    ) -> manifest.Manifest:
150        """Hashes a model using the current configuration."""
151        # All paths in ``_ignored_paths`` are expected to be relative to the
152        # model directory. Join them to ``model_path`` and ensure they do not
153        # escape it.
154        model_path = pathlib.Path(model_path)
155        ignored_paths = []
156        for p in self._ignored_paths:
157            full = model_path / p
158            try:
159                full.relative_to(model_path)
160            except ValueError:
161                continue
162            ignored_paths.append(full)
163
164        if self._ignore_git_paths:
165            ignored_paths.extend(
166                [
167                    model_path / p
168                    for p in [
169                        ".git/",
170                        ".gitattributes",
171                        ".github/",
172                        ".gitignore",
173                    ]
174                ]
175            )
176
177        self._serializer.set_allow_symlinks(self._allow_symlinks)
178
179        return self._serializer.serialize(
180            pathlib.Path(model_path),
181            ignore_paths=ignored_paths,
182            files_to_hash=files_to_hash,
183        )
184
185    def _build_stream_hasher(
186        self,
187        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
188    ) -> hashing.StreamingHashEngine:
189        """Builds a streaming hasher from a constant string.
190
191        Args:
192            hashing_algorithm: The hashing algorithm to use.
193
194        Returns:
195            An instance of the requested hasher.
196        """
197        match hashing_algorithm:
198            case "sha256":
199                return memory.SHA256()
200            case "blake2":
201                return memory.BLAKE2()
202            case "blake3":
203                return memory.BLAKE3()
204            case _:
205                raise ValueError(
206                    f"Unsupported hashing method {hashing_algorithm}"
207                )
208
209    def _build_file_hasher_factory(
210        self,
211        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
212        chunk_size: int = 1048576,
213        max_workers: int | None = None,
214    ) -> Callable[[pathlib.Path], io.FileHasher]:
215        """Builds the hasher factory for a serialization by file.
216
217        Args:
218            hashing_algorithm: The hashing algorithm to use to hash a file.
219            chunk_size: The amount of file to read at once. Default is 1MB. A
220              special value of 0 signals to attempt to read everything in a
221              single call. This is ignored for BLAKE3.
222            max_workers: Maximum number of workers to use in parallel. Defaults
223              to the number of logical cores. Only relevant for BLAKE3.
224
225        Returns:
226            The hasher factory that should be used by the active serialization
227            method.
228        """
229        if max_workers is None:
230            max_workers = blake3.blake3.AUTO
231
232        def _factory(path: pathlib.Path) -> io.FileHasher:
233            if hashing_algorithm == "blake3":
234                return io.Blake3FileHasher(path, max_threads=max_workers)
235            hasher = self._build_stream_hasher(hashing_algorithm)
236            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)
237
238        return _factory
239
240    def _build_sharded_file_hasher_factory(
241        self,
242        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
243        chunk_size: int = 1048576,
244        shard_size: int = 1_000_000_000,
245    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
246        """Builds the hasher factory for a serialization by file shards.
247
248        This is not recommended for BLAKE3 because it is not necessary. BLAKE3
249        already operates in parallel.
250
251        Args:
252            hashing_algorithm: The hashing algorithm to use to hash a shard.
253            chunk_size: The amount of file to read at once. Default is 1MB. A
254              special value of 0 signals to attempt to read everything in a
255              single call.
256            shard_size: The size of a file shard. Default is 1 GB.
257
258        Returns:
259            The hasher factory that should be used by the active serialization
260            method.
261        """
262
263        def _factory(
264            path: pathlib.Path, start: int, end: int
265        ) -> io.ShardedFileHasher:
266            hasher = self._build_stream_hasher(hashing_algorithm)
267            return io.ShardedFileHasher(
268                path,
269                hasher,
270                start=start,
271                end=end,
272                chunk_size=chunk_size,
273                shard_size=shard_size,
274            )
275
276        return _factory
277
278    def use_file_serialization(
279        self,
280        *,
281        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
282        chunk_size: int = 1048576,
283        max_workers: int | None = None,
284        allow_symlinks: bool = False,
285        ignore_paths: Iterable[pathlib.Path] = frozenset(),
286    ) -> Self:
287        """Configures serialization to build a manifest of (file, hash) pairs.
288
289        The serialization method in this configuration is changed to one where
290        every file in the model is paired with its digest and a manifest
291        containing all these pairings is being built.
292
293        Args:
294            hashing_algorithm: The hashing algorithm to use to hash a file.
295            chunk_size: The amount of file to read at once. Default is 1MB. A
296              special value of 0 signals to attempt to read everything in a
297              single call. Ignored for BLAKE3.
298            max_workers: Maximum number of workers to use in parallel. Default
299              is to defer to the `concurrent.futures` library to select the best
300              value for the current machine, or the number of logical cores
301              when doing BLAKE3 hashing. When reading files off of slower
302              hardware like an HDD rather than an SSD, and using BLAKE3,
303              setting max_workers to 1 may improve performance.
304            allow_symlinks: Controls whether symbolic links are included. If a
305              symlink is present but the flag is `False` (default) the
306              serialization would raise an error.
307
308        Returns:
309            The new hashing configuration with the new serialization method.
310        """
311        self._serializer = file.Serializer(
312            self._build_file_hasher_factory(
313                hashing_algorithm, chunk_size, max_workers
314            ),
315            max_workers=max_workers,
316            allow_symlinks=allow_symlinks,
317            ignore_paths=ignore_paths,
318        )
319        return self
320
321    def use_shard_serialization(
322        self,
323        *,
324        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
325        chunk_size: int = 1048576,
326        shard_size: int = 1_000_000_000,
327        max_workers: int | None = None,
328        allow_symlinks: bool = False,
329        ignore_paths: Iterable[pathlib.Path] = frozenset(),
330    ) -> Self:
331        """Configures serialization to build a manifest of (shard, hash) pairs.
332
333        For BLAKE3 this is equivalent to not sharding. Sharding is bypassed
334        because BLAKE3 already operates in parallel. This means the chunk_size
335        and shard_size args are ignored.
336
337        The serialization method in this configuration is changed to one where
338        every file in the model is sharded in equal sized shards, every shard is
339        paired with its digest and a manifest containing all these pairings is
340        being built.
341
342        Args:
343            hashing_algorithm: The hashing algorithm to use to hash a shard.
344            chunk_size: The amount of file to read at once. Default is 1MB. A
345              special value of 0 signals to attempt to read everything in a
346              single call.
347            shard_size: The size of a file shard. Default is 1 GB.
348            max_workers: Maximum number of workers to use in parallel. Default
349              is to defer to the `concurrent.futures` library to select the best
350              value for the current machine.
351            allow_symlinks: Controls whether symbolic links are included. If a
352              symlink is present but the flag is `False` (default) the
353              serialization would raise an error.
354            ignore_paths: Paths of files to ignore.
355
356        Returns:
357            The new hashing configuration with the new serialization method.
358        """
359        if hashing_algorithm == "blake3":
360            return self.use_file_serialization(
361                hashing_algorithm=hashing_algorithm,
362                chunk_size=chunk_size,
363                max_workers=max_workers,
364                allow_symlinks=allow_symlinks,
365                ignore_paths=ignore_paths,
366            )
367
368        self._serializer = file_shard.Serializer(
369            self._build_sharded_file_hasher_factory(
370                hashing_algorithm, chunk_size, shard_size
371            ),
372            max_workers=max_workers,
373            allow_symlinks=allow_symlinks,
374            ignore_paths=ignore_paths,
375        )
376        return self
377
378    def set_ignored_paths(
379        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
380    ) -> Self:
381        """Configures the paths to be ignored during serialization of a model.
382
383        If the model is a single file, there are no paths that are ignored. If
384        the model is a directory, all paths are considered as relative to the
385        model directory, since we never look at files outside of it.
386
387        If an ignored path is a directory, serialization will ignore both the
388        path and any of its children.
389
390        Args:
391            paths: The paths to ignore.
392            ignore_git_paths: Whether to ignore git related paths (default) or
393              include them in the signature.
394
395        Returns:
396            The new hashing configuration with a new set of ignored paths.
397        """
398        # Preserve the user-provided relative paths; they are resolved against
399        # the model directory later when hashing.
400        self._ignored_paths = frozenset(pathlib.Path(p) for p in paths)
401        self._ignore_git_paths = ignore_git_paths
402        return self
403
404    def add_ignored_paths(
405        self, *, model_path: PathLike, paths: Iterable[PathLike]
406    ) -> None:
407        """Add more paths to ignore to existing set of paths.
408
409        Args:
410            model_path: The path to the model
411            paths: Additional paths to ignore. All path must be relative to
412                   the model directory.
413        """
414        newset = set(self._ignored_paths)
415        model_path = pathlib.Path(model_path)
416        for p in paths:
417            candidate = pathlib.Path(p)
418            full = model_path / candidate
419            try:
420                full.relative_to(model_path)
421            except ValueError:
422                continue
423            newset.add(candidate)
424        self._ignored_paths = newset
425
426    def set_allow_symlinks(self, allow_symlinks: bool) -> Self:
427        """Set whether following symlinks is allowed."""
428        self._allow_symlinks = allow_symlinks
429        return self

Configuration to use when hashing models.

Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.

Hashing builds a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.

This configuration class supports configuring the hashing granularity. By default, we hash at file level granularity.

This configuration class also supports configuring the hash method used to generate the hash for every object in the model. We currently support SHA256, BLAKE2, and BLAKE3, with SHA256 being the default.

This configuration class also supports configuring which paths from the model directory should be ignored. These are files that don't impact the behavior of the model, or files that won't be distributed with the model. By default, only files that are associated with a git repository (.git, .gitattributes, .gitignore, etc.) are ignored.

Config()
137    def __init__(self):
138        """Initializes the default configuration for hashing."""
139        self._ignored_paths = frozenset()
140        self._ignore_git_paths = True
141        self.use_file_serialization()
142        self._allow_symlinks = False

Initializes the default configuration for hashing.

def hash( self, model_path: str | bytes | os.PathLike, *, files_to_hash: Iterable[str | bytes | os.PathLike] | None = None) -> model_signing.manifest.Manifest:
144    def hash(
145        self,
146        model_path: PathLike,
147        *,
148        files_to_hash: Iterable[PathLike] | None = None,
149    ) -> manifest.Manifest:
150        """Hashes a model using the current configuration."""
151        # All paths in ``_ignored_paths`` are expected to be relative to the
152        # model directory. Join them to ``model_path`` and ensure they do not
153        # escape it.
154        model_path = pathlib.Path(model_path)
155        ignored_paths = []
156        for p in self._ignored_paths:
157            full = model_path / p
158            try:
159                full.relative_to(model_path)
160            except ValueError:
161                continue
162            ignored_paths.append(full)
163
164        if self._ignore_git_paths:
165            ignored_paths.extend(
166                [
167                    model_path / p
168                    for p in [
169                        ".git/",
170                        ".gitattributes",
171                        ".github/",
172                        ".gitignore",
173                    ]
174                ]
175            )
176
177        self._serializer.set_allow_symlinks(self._allow_symlinks)
178
179        return self._serializer.serialize(
180            pathlib.Path(model_path),
181            ignore_paths=ignored_paths,
182            files_to_hash=files_to_hash,
183        )

Hashes a model using the current configuration.

def use_file_serialization( self, *, hashing_algorithm: Literal['sha256', 'blake2', 'blake3'] = 'sha256', chunk_size: int = 1048576, max_workers: int | None = None, allow_symlinks: bool = False, ignore_paths: Iterable[pathlib.Path] = frozenset()) -> Self:
    def use_file_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
        chunk_size: int = 1048576,
        max_workers: int | None = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (file, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is paired with its digest and a manifest
        containing all these pairings is being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call. Ignored for BLAKE3.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library to select the best
              value for the current machine, or the number of logical cores
              when doing BLAKE3 hashing. When reading files off of slower
              hardware like an HDD rather than an SSD, and using BLAKE3,
              setting max_workers to 1 may improve performance.
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: Paths of files to ignore.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file.Serializer(
            self._build_file_hasher_factory(
                hashing_algorithm, chunk_size, max_workers
            ),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

Configures serialization to build a manifest of (file, hash) pairs.

The serialization method in this configuration is changed to one where every file in the model is paired with its digest and a manifest containing all these pairings is being built.

Arguments:
  • hashing_algorithm: The hashing algorithm to use to hash a file.
  • chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call. Ignored for BLAKE3.
  • max_workers: Maximum number of workers to use in parallel. Default is to defer to the concurrent.futures library to select the best value for the current machine, or the number of logical cores when doing BLAKE3 hashing. When reading files off of slower hardware like an HDD rather than an SSD, and using BLAKE3, setting max_workers to 1 may improve performance.
  • allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is False (default) the serialization would raise an error.
Returns:

The new hashing configuration with the new serialization method.

def use_shard_serialization( self, *, hashing_algorithm: Literal['sha256', 'blake2', 'blake3'] = 'sha256', chunk_size: int = 1048576, shard_size: int = 1000000000, max_workers: int | None = None, allow_symlinks: bool = False, ignore_paths: Iterable[pathlib.Path] = frozenset()) -> Self:
321    def use_shard_serialization(
322        self,
323        *,
324        hashing_algorithm: Literal["sha256", "blake2", "blake3"] = "sha256",
325        chunk_size: int = 1048576,
326        shard_size: int = 1_000_000_000,
327        max_workers: int | None = None,
328        allow_symlinks: bool = False,
329        ignore_paths: Iterable[pathlib.Path] = frozenset(),
330    ) -> Self:
331        """Configures serialization to build a manifest of (shard, hash) pairs.
332
333        For BLAKE3 this is equivalent to not sharding. Sharding is bypassed
334        because BLAKE3 already operates in parallel. This means the chunk_size
335        and shard_size args are ignored.
336
337        The serialization method in this configuration is changed to one where
338        every file in the model is sharded in equal sized shards, every shard is
339        paired with its digest and a manifest containing all these pairings is
340        being built.
341
342        Args:
343            hashing_algorithm: The hashing algorithm to use to hash a shard.
344            chunk_size: The amount of file to read at once. Default is 1MB. A
345              special value of 0 signals to attempt to read everything in a
346              single call.
347            shard_size: The size of a file shard. Default is 1 GB.
348            max_workers: Maximum number of workers to use in parallel. Default
349              is to defer to the `concurrent.futures` library to select the best
350              value for the current machine.
351            allow_symlinks: Controls whether symbolic links are included. If a
352              symlink is present but the flag is `False` (default) the
353              serialization would raise an error.
354            ignore_paths: Paths of files to ignore.
355
356        Returns:
357            The new hashing configuration with the new serialization method.
358        """
359        if hashing_algorithm == "blake3":
360            return self.use_file_serialization(
361                hashing_algorithm=hashing_algorithm,
362                chunk_size=chunk_size,
363                max_workers=max_workers,
364                allow_symlinks=allow_symlinks,
365                ignore_paths=ignore_paths,
366            )
367
368        self._serializer = file_shard.Serializer(
369            self._build_sharded_file_hasher_factory(
370                hashing_algorithm, chunk_size, shard_size
371            ),
372            max_workers=max_workers,
373            allow_symlinks=allow_symlinks,
374            ignore_paths=ignore_paths,
375        )
376        return self

Configures serialization to build a manifest of (shard, hash) pairs.

For BLAKE3 this is equivalent to not sharding. Sharding is bypassed because BLAKE3 already operates in parallel. This means the chunk_size and shard_size args are ignored.

The serialization method in this configuration is changed to one where every file in the model is sharded in equal sized shards, every shard is paired with its digest and a manifest containing all these pairings is being built.

Arguments:
  • hashing_algorithm: The hashing algorithm to use to hash a shard.
  • chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
  • shard_size: The size of a file shard. Default is 1 GB.
  • max_workers: Maximum number of workers to use in parallel. Default is to defer to the concurrent.futures library to select the best value for the current machine.
  • allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is False (default) the serialization would raise an error.
  • ignore_paths: Paths of files to ignore.
Returns:

The new hashing configuration with the new serialization method.

def set_ignored_paths( self, *, paths: Iterable[str | bytes | os.PathLike], ignore_git_paths: bool = True) -> Self:
    def set_ignored_paths(
        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
    ) -> Self:
        """Configures the paths to be ignored during serialization of a model.

        If the model is a single file, there are no paths that are ignored. If
        the model is a directory, all paths are considered as relative to the
        model directory, since we never look at files outside of it.

        If an ignored path is a directory, serialization will ignore both the
        path and any of its children.

        Args:
            paths: The paths to ignore.
            ignore_git_paths: Whether to ignore git related paths (default) or
              include them in the signature.

        Returns:
            The new hashing configuration with a new set of ignored paths.
        """
        # Preserve the user-provided relative paths; they are resolved against
        # the model directory later when hashing, where any path that would
        # fall outside the model directory is silently dropped.
        self._ignored_paths = frozenset(pathlib.Path(p) for p in paths)
        self._ignore_git_paths = ignore_git_paths
        return self

Configures the paths to be ignored during serialization of a model.

If the model is a single file, there are no paths that are ignored. If the model is a directory, all paths are considered as relative to the model directory, since we never look at files outside of it.

If an ignored path is a directory, serialization will ignore both the path and any of its children.

Arguments:
  • paths: The paths to ignore.
  • ignore_git_paths: Whether to ignore git related paths (default) or include them in the signature.
Returns:

The new hashing configuration with a new set of ignored paths.

def add_ignored_paths( self, *, model_path: str | bytes | os.PathLike, paths: Iterable[str | bytes | os.PathLike]) -> None:
404    def add_ignored_paths(
405        self, *, model_path: PathLike, paths: Iterable[PathLike]
406    ) -> None:
407        """Add more paths to ignore to existing set of paths.
408
409        Args:
410            model_path: The path to the model
411            paths: Additional paths to ignore. All path must be relative to
412                   the model directory.
413        """
414        newset = set(self._ignored_paths)
415        model_path = pathlib.Path(model_path)
416        for p in paths:
417            candidate = pathlib.Path(p)
418            full = model_path / candidate
419            try:
420                full.relative_to(model_path)
421            except ValueError:
422                continue
423            newset.add(candidate)
424        self._ignored_paths = newset

Add more paths to ignore to existing set of paths.

Arguments:
  • model_path: The path to the model
  • paths: Additional paths to ignore. All paths must be relative to the model directory.