Edit on GitHub

model_signing.hashing

High level API for the hashing interface of model_signing library.

Hashing is used both for signing and verification and users should ensure that the same configuration is used in both cases.

The module could also be used to just hash a single model, without signing it:

    model_signing.hashing.hash(model_path)

This module allows setting up the hashing configuration to a single variable and then sharing it between signing and verification.

hashing_config = model_signing.hashing.Config().set_ignored_paths(
    paths=["README.md"], ignore_git_paths=True
)

signing_config = (
    model_signing.signing.Config()
    .use_elliptic_key_signer(private_key="key")
    .set_hashing_config(hashing_config)
)

verifying_config = (
    model_signing.verifying.Config()
    .use_elliptic_key_verifier(public_key="key.pub")
    .set_hashing_config(hashing_config)
)

The API defined here is stable and backwards compatible.

  1# Copyright 2024 The Sigstore Authors
  2#
  3# Licensed under the Apache License, Version 2.0 (the "License");
  4# you may not use this file except in compliance with the License.
  5# You may obtain a copy of the License at
  6#
  7#      http://www.apache.org/licenses/LICENSE-2.0
  8#
  9# Unless required by applicable law or agreed to in writing, software
 10# distributed under the License is distributed on an "AS IS" BASIS,
 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12# See the License for the specific language governing permissions and
 13# limitations under the License.
 14
 15"""High level API for the hashing interface of `model_signing` library.
 16
 17Hashing is used both for signing and verification and users should ensure that
 18the same configuration is used in both cases.
 19
 20The module could also be used to just hash a single model, without signing it:
 21
 22```python
 23model_signing.hashing.hash(model_path)
 24```
 25
 26This module allows setting up the hashing configuration to a single variable and
 27then sharing it between signing and verification.
 28
 29```python
 30hashing_config = model_signing.hashing.Config().set_ignored_paths(
 31    paths=["README.md"], ignore_git_paths=True
 32)
 33
 34signing_config = (
 35    model_signing.signing.Config()
 36    .use_elliptic_key_signer(private_key="key")
 37    .set_hashing_config(hashing_config)
 38)
 39
 40verifying_config = (
 41    model_signing.verifying.Config()
 42    .use_elliptic_key_verifier(public_key="key.pub")
 43    .set_hashing_config(hashing_config)
 44)
 45```
 46
 47The API defined here is stable and backwards compatible.
 48"""
 49
 50from collections.abc import Callable, Iterable
 51import os
 52import pathlib
 53import sys
 54from typing import Literal, Optional, Union
 55
 56from model_signing import manifest
 57from model_signing._hashing import hashing
 58from model_signing._hashing import io
 59from model_signing._hashing import memory
 60from model_signing._serialization import file
 61from model_signing._serialization import file_shard
 62
 63
 64if sys.version_info >= (3, 11):
 65    from typing import Self
 66else:
 67    from typing_extensions import Self
 68
 69
 70# `TypeAlias` only exists from Python 3.10
 71# `TypeAlias` is deprecated in Python 3.12 in favor of `type`
 72if sys.version_info >= (3, 10):
 73    from typing import TypeAlias
 74else:
 75    from typing_extensions import TypeAlias
 76
 77
# Type alias to support `os.PathLike`, `str` and `bytes` objects in the API.
# When Python 3.12 is the minimum supported version we can use `type`.
# When Python 3.11 is the minimum supported version we can use `|`.
PathLike: TypeAlias = Union[str, bytes, os.PathLike]
 82
 83
 84def hash(model_path: PathLike) -> manifest.Manifest:
 85    """Hashes a model using the default configuration.
 86
 87    Hashing is the shared part between signing and verification and is also
 88    expected to be the slowest component. When serializing a model, we need to
 89    spend time proportional to the model size on disk.
 90
 91    This method returns a "manifest" of the model. A manifest is a collection of
 92    every object in the model, paired with the corresponding hash. Currently, we
 93    consider an object in the model to be either a file or a shard of the file.
 94    Large models with large files will be hashed much faster when every shard is
 95    hashed in parallel, at the cost of generating a larger payload for the
 96    signature. In future releases we could support hashing individual tensors or
 97    tensor slices for further speed optimizations for very large models.
 98
 99    Args:
100        model_path: The path to the model to hash.
101
102    Returns:
103        A manifest of the hashed model.
104    """
105    return Config().hash(model_path)
106
107
class Config:
    """Configuration to use when hashing models.

    Hashing is the shared part between signing and verification and is also
    expected to be the slowest component. When serializing a model, we need to
    spend time proportional to the model size on disk.

    Hashing builds a "manifest" of the model. A manifest is a collection of
    every object in the model, paired with the corresponding hash. Currently, we
    consider an object in the model to be either a file or a shard of the file.
    Large models with large files will be hashed much faster when every shard is
    hashed in parallel, at the cost of generating a larger payload for the
    signature. In future releases we could support hashing individual tensors or
    tensor slices for further speed optimizations for very large models.

    This configuration class supports configuring the hashing granularity. By
    default, we hash at file level granularity.

    This configuration class also supports configuring the hash method used to
    generate the hash for every object in the model. We currently support SHA256
    and BLAKE2, with SHA256 being the default.

    This configuration class also supports configuring which paths from the
    model directory should be ignored. These are files that don't impact the
    behavior of the model, or files that won't be distributed with the model. By
    default, only files that are associated with a git repository (`.git`,
    `.gitattributes`, `.gitignore`, etc.) are ignored.
    """

    def __init__(self):
        """Initializes the default configuration for hashing."""
        # Resolved paths to skip during serialization.
        self._ignored_paths = frozenset()
        # Whether git metadata paths are skipped during serialization.
        self._ignore_git_paths = True
        self.use_file_serialization()

    def hash(self, model_path: PathLike) -> manifest.Manifest:
        """Hashes a model using the current configuration."""
        # All paths in ignored_paths must have model_path as prefix; entries
        # that point outside the model directory can never match a serialized
        # file, so they are dropped here.
        ignored_paths = []
        for p in self._ignored_paths:
            rp = os.path.relpath(p, model_path)
            # rp starts with "../" if p is not relative to model_path
            if not rp.startswith("../"):
                ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))

        if self._ignore_git_paths:
            # Wrap in pathlib.Path for consistency with the entries appended
            # above: the serializers declare `ignore_paths` as an iterable of
            # `pathlib.Path`, not plain strings.
            ignored_paths.extend(
                pathlib.Path(model_path, p)
                for p in [".git/", ".gitattributes", ".github/", ".gitignore"]
            )

        return self._serializer.serialize(
            pathlib.Path(model_path), ignore_paths=ignored_paths
        )

    def _build_stream_hasher(
        self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
    ) -> hashing.StreamingHashEngine:
        """Builds a streaming hasher from a constant string.

        Args:
            hashing_algorithm: The hashing algorithm to use.

        Returns:
            An instance of the requested hasher.

        Raises:
            ValueError: If the algorithm name is not supported.
        """
        # TODO: Once Python 3.9 support is deprecated revert to using `match`
        if hashing_algorithm == "sha256":
            return memory.SHA256()
        if hashing_algorithm == "blake2":
            return memory.BLAKE2()

        raise ValueError(f"Unsupported hashing method {hashing_algorithm}")

    def _build_file_hasher_factory(
        self,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
    ) -> Callable[[pathlib.Path], io.SimpleFileHasher]:
        """Builds the hasher factory for a serialization by file.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.

        Returns:
            The hasher factory that should be used by the active serialization
            method.
        """

        def _factory(path: pathlib.Path) -> io.SimpleFileHasher:
            hasher = self._build_stream_hasher(hashing_algorithm)
            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)

        return _factory

    def _build_sharded_file_hasher_factory(
        self,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        shard_size: int = 1_000_000_000,
    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
        """Builds the hasher factory for a serialization by file shards.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a shard.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            shard_size: The size of a file shard. Default is 1 GB.

        Returns:
            The hasher factory that should be used by the active serialization
            method.
        """

        def _factory(
            path: pathlib.Path, start: int, end: int
        ) -> io.ShardedFileHasher:
            hasher = self._build_stream_hasher(hashing_algorithm)
            return io.ShardedFileHasher(
                path,
                hasher,
                start=start,
                end=end,
                chunk_size=chunk_size,
                shard_size=shard_size,
            )

        return _factory

    def use_file_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (file, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is paired with its digest and a manifest
        containing all these pairings is being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library to select the best
              value for the current machine.
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: Paths of files to ignore.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file.Serializer(
            self._build_file_hasher_factory(hashing_algorithm, chunk_size),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

    def use_shard_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        shard_size: int = 1_000_000_000,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (shard, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is sharded in equal sized shards, every shard is
        paired with its digest and a manifest containing all these pairings is
        being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a shard.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            shard_size: The size of a file shard. Default is 1 GB.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library to select the best
              value for the current machine.
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: Paths of files to ignore.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file_shard.Serializer(
            self._build_sharded_file_hasher_factory(
                hashing_algorithm, chunk_size, shard_size
            ),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

    def set_ignored_paths(
        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
    ) -> Self:
        """Configures the paths to be ignored during serialization of a model.

        If the model is a single file, there are no paths that are ignored. If
        the model is a directory, all paths are considered as relative to the
        model directory, since we never look at files outside of it.

        If an ignored path is a directory, serialization will ignore both the
        path and any of its children.

        Args:
            paths: The paths to ignore.
            ignore_git_paths: Whether to ignore git related paths (default) or
              include them in the signature.

        Returns:
            The new hashing configuration with a new set of ignored paths.
        """
        # Resolve the paths so equivalent spellings (e.g. 'a/../a/b') compare
        # equal; paths outside the model directory are filtered out in `hash`.
        self._ignored_paths = frozenset(
            {pathlib.Path(p).resolve() for p in paths}
        )
        self._ignore_git_paths = ignore_git_paths
        return self

    def add_ignored_paths(
        self, *, model_path: PathLike, paths: Iterable[PathLike]
    ) -> None:
        """Adds more paths to ignore to the existing set of paths.

        Args:
            model_path: The path to the model.
            paths: Additional paths to ignore. All paths must be relative to
              the model directory.
        """
        newset = set(self._ignored_paths)
        # Resolve just like `set_ignored_paths` so the set never mixes plain
        # strings with resolved `pathlib.Path` objects.
        newset.update(pathlib.Path(model_path, p).resolve() for p in paths)
        self._ignored_paths = newset
PathLike: TypeAlias = Union[str, bytes, os.PathLike]
def hash( model_path: Union[str, bytes, os.PathLike]) -> model_signing.manifest.Manifest:
 85def hash(model_path: PathLike) -> manifest.Manifest:
 86    """Hashes a model using the default configuration.
 87
 88    Hashing is the shared part between signing and verification and is also
 89    expected to be the slowest component. When serializing a model, we need to
 90    spend time proportional to the model size on disk.
 91
 92    This method returns a "manifest" of the model. A manifest is a collection of
 93    every object in the model, paired with the corresponding hash. Currently, we
 94    consider an object in the model to be either a file or a shard of the file.
 95    Large models with large files will be hashed much faster when every shard is
 96    hashed in parallel, at the cost of generating a larger payload for the
 97    signature. In future releases we could support hashing individual tensors or
 98    tensor slices for further speed optimizations for very large models.
 99
100    Args:
101        model_path: The path to the model to hash.
102
103    Returns:
104        A manifest of the hashed model.
105    """
106    return Config().hash(model_path)

Hashes a model using the default configuration.

Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.

This method returns a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.

Arguments:
  • model_path: The path to the model to hash.
Returns:

A manifest of the hashed model.

class Config:
109class Config:
110    """Configuration to use when hashing models.
111
112    Hashing is the shared part between signing and verification and is also
113    expected to be the slowest component. When serializing a model, we need to
114    spend time proportional to the model size on disk.
115
116    Hashing builds a "manifest" of the model. A manifest is a collection of
117    every object in the model, paired with the corresponding hash. Currently, we
118    consider an object in the model to be either a file or a shard of the file.
119    Large models with large files will be hashed much faster when every shard is
120    hashed in parallel, at the cost of generating a larger payload for the
121    signature. In future releases we could support hashing individual tensors or
122    tensor slices for further speed optimizations for very large models.
123
124    This configuration class supports configuring the hashing granularity. By
125    default, we hash at file level granularity.
126
127    This configuration class also supports configuring the hash method used to
128    generate the hash for every object in the model. We currently support SHA256
129    and BLAKE2, with SHA256 being the default.
130
131    This configuration class also supports configuring which paths from the
132    model directory should be ignored. These are files that doesn't impact the
133    behavior of the model, or files that won't be distributed with the model. By
134    default, only files that are associated with a git repository (`.git`,
135    `.gitattributes`, `.gitignore`, etc.) are ignored.
136    """
137
138    def __init__(self):
139        """Initializes the default configuration for hashing."""
140        self._ignored_paths = frozenset()
141        self._ignore_git_paths = True
142        self.use_file_serialization()
143
144    def hash(self, model_path: PathLike) -> manifest.Manifest:
145        """Hashes a model using the current configuration."""
146        # All paths in ignored_paths must have model_path as prefix
147        ignored_paths = []
148        for p in self._ignored_paths:
149            rp = os.path.relpath(p, model_path)
150            # rp may start with "../" if it is not relative to model_path
151            if not rp.startswith("../"):
152                ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))
153
154        if self._ignore_git_paths:
155            ignored_paths.extend(
156                [
157                    os.path.join(model_path, p)
158                    for p in [
159                        ".git/",
160                        ".gitattributes",
161                        ".github/",
162                        ".gitignore",
163                    ]
164                ]
165            )
166
167        return self._serializer.serialize(
168            pathlib.Path(model_path), ignore_paths=ignored_paths
169        )
170
171    def _build_stream_hasher(
172        self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
173    ) -> hashing.StreamingHashEngine:
174        """Builds a streaming hasher from a constant string.
175
176        Args:
177            hashing_algorithm: The hashing algorithm to use.
178
179        Returns:
180            An instance of the requested hasher.
181        """
182        # TODO: Once Python 3.9 support is deprecated revert to using `match`
183        if hashing_algorithm == "sha256":
184            return memory.SHA256()
185        if hashing_algorithm == "blake2":
186            return memory.BLAKE2()
187
188        raise ValueError(f"Unsupported hashing method {hashing_algorithm}")
189
190    def _build_file_hasher_factory(
191        self,
192        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
193        chunk_size: int = 1048576,
194    ) -> Callable[[pathlib.Path], io.SimpleFileHasher]:
195        """Builds the hasher factory for a serialization by file.
196
197        Args:
198            hashing_algorithm: The hashing algorithm to use to hash a file.
199            chunk_size: The amount of file to read at once. Default is 1MB. A
200              special value of 0 signals to attempt to read everything in a
201              single call.
202
203        Returns:
204            The hasher factory that should be used by the active serialization
205            method.
206        """
207
208        def _factory(path: pathlib.Path) -> io.SimpleFileHasher:
209            hasher = self._build_stream_hasher(hashing_algorithm)
210            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)
211
212        return _factory
213
214    def _build_sharded_file_hasher_factory(
215        self,
216        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
217        chunk_size: int = 1048576,
218        shard_size: int = 1_000_000_000,
219    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
220        """Builds the hasher factory for a serialization by file shards.
221
222        Args:
223            hashing_algorithm: The hashing algorithm to use to hash a shard.
224            chunk_size: The amount of file to read at once. Default is 1MB. A
225              special value of 0 signals to attempt to read everything in a
226              single call.
227            shard_size: The size of a file shard. Default is 1 GB.
228
229        Returns:
230            The hasher factory that should be used by the active serialization
231            method.
232        """
233
234        def _factory(
235            path: pathlib.Path, start: int, end: int
236        ) -> io.ShardedFileHasher:
237            hasher = self._build_stream_hasher(hashing_algorithm)
238            return io.ShardedFileHasher(
239                path,
240                hasher,
241                start=start,
242                end=end,
243                chunk_size=chunk_size,
244                shard_size=shard_size,
245            )
246
247        return _factory
248
249    def use_file_serialization(
250        self,
251        *,
252        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
253        chunk_size: int = 1048576,
254        max_workers: Optional[int] = None,
255        allow_symlinks: bool = False,
256        ignore_paths: Iterable[pathlib.Path] = frozenset(),
257    ) -> Self:
258        """Configures serialization to build a manifest of (file, hash) pairs.
259
260        The serialization method in this configuration is changed to one where
261        every file in the model is paired with its digest and a manifest
262        containing all these pairings is being built.
263
264        Args:
265            hashing_algorithm: The hashing algorithm to use to hash a file.
266            chunk_size: The amount of file to read at once. Default is 1MB. A
267              special value of 0 signals to attempt to read everything in a
268              single call.
269            max_workers: Maximum number of workers to use in parallel. Default
270              is to defer to the `concurrent.futures` library to select the best
271              value for the current machine.
272            allow_symlinks: Controls whether symbolic links are included. If a
273              symlink is present but the flag is `False` (default) the
274              serialization would raise an error.
275
276        Returns:
277            The new hashing configuration with the new serialization method.
278        """
279        self._serializer = file.Serializer(
280            self._build_file_hasher_factory(hashing_algorithm, chunk_size),
281            max_workers=max_workers,
282            allow_symlinks=allow_symlinks,
283            ignore_paths=ignore_paths,
284        )
285        return self
286
287    def use_shard_serialization(
288        self,
289        *,
290        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
291        chunk_size: int = 1048576,
292        shard_size: int = 1_000_000_000,
293        max_workers: Optional[int] = None,
294        allow_symlinks: bool = False,
295        ignore_paths: Iterable[pathlib.Path] = frozenset(),
296    ) -> Self:
297        """Configures serialization to build a manifest of (shard, hash) pairs.
298
299        The serialization method in this configuration is changed to one where
300        every file in the model is sharded in equal sized shards, every shard is
301        paired with its digest and a manifest containing all these pairings is
302        being built.
303
304        Args:
305            hashing_algorithm: The hashing algorithm to use to hash a shard.
306            chunk_size: The amount of file to read at once. Default is 1MB. A
307              special value of 0 signals to attempt to read everything in a
308              single call.
309            shard_size: The size of a file shard. Default is 1 GB.
310            max_workers: Maximum number of workers to use in parallel. Default
311              is to defer to the `concurrent.futures` library to select the best
312              value for the current machine.
313            allow_symlinks: Controls whether symbolic links are included. If a
314              symlink is present but the flag is `False` (default) the
315              serialization would raise an error.
316            ignore_paths: Paths of files to ignore.
317
318        Returns:
319            The new hashing configuration with the new serialization method.
320        """
321        self._serializer = file_shard.Serializer(
322            self._build_sharded_file_hasher_factory(
323                hashing_algorithm, chunk_size, shard_size
324            ),
325            max_workers=max_workers,
326            allow_symlinks=allow_symlinks,
327            ignore_paths=ignore_paths,
328        )
329        return self
330
331    def set_ignored_paths(
332        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
333    ) -> Self:
334        """Configures the paths to be ignored during serialization of a model.
335
336        If the model is a single file, there are no paths that are ignored. If
337        the model is a directory, all paths are considered as relative to the
338        model directory, since we never look at files outside of it.
339
340        If an ignored path is a directory, serialization will ignore both the
341        path and any of its children.
342
343        Args:
344            paths: The paths to ignore.
345            ignore_git_paths: Whether to ignore git related paths (default) or
346              include them in the signature.
347
348        Returns:
349            The new hashing configuration with a new set of ignored paths.
350        """
351        # Use relpath to possibly fix weird paths like '../a/b' -> 'b'
352        # when '../a/' is a no-op
353        self._ignored_paths = frozenset(
354            {pathlib.Path(p).resolve() for p in paths}
355        )
356        self._ignore_git_paths = ignore_git_paths
357        return self
358
359    def add_ignored_paths(
360        self, *, model_path: PathLike, paths: Iterable[PathLike]
361    ) -> None:
362        """Add more paths to ignore to existing set of paths.
363
364        Args:
365            model_path: The path to the model
366            paths: Additional paths to ignore. All path must be relative to
367                   the model directory.
368        """
369        newset = set(self._ignored_paths)
370        newset.update([os.path.join(model_path, p) for p in paths])
371        self._ignored_paths = newset

Configuration to use when hashing models.

Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.

Hashing builds a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.

This configuration class supports configuring the hashing granularity. By default, we hash at file level granularity.

This configuration class also supports configuring the hash method used to generate the hash for every object in the model. We currently support SHA256 and BLAKE2, with SHA256 being the default.

This configuration class also supports configuring which paths from the model directory should be ignored. These are files that don't impact the behavior of the model, or files that won't be distributed with the model. By default, only files that are associated with a git repository (.git, .gitattributes, .gitignore, etc.) are ignored.

Config()
138    def __init__(self):
139        """Initializes the default configuration for hashing."""
140        self._ignored_paths = frozenset()
141        self._ignore_git_paths = True
142        self.use_file_serialization()

Initializes the default configuration for hashing.

def hash( self, model_path: Union[str, bytes, os.PathLike]) -> model_signing.manifest.Manifest:
def hash(self, model_path: PathLike) -> manifest.Manifest:
    """Hashes a model using the current configuration.

    Args:
        model_path: The path to the model to hash.

    Returns:
        The manifest produced by the configured serialization method,
        pairing every object in the model with its digest.
    """
    # All paths in ignored_paths must have model_path as prefix; drop any
    # configured path that falls outside the model directory.
    ignored_paths = []
    for p in self._ignored_paths:
        rp = os.path.relpath(p, model_path)
        # relpath signals "outside model_path" with a leading ".." component.
        # Use os.pardir/os.sep so this also works on Windows ("..\\..."),
        # and exclude the bare ".." (the parent directory itself).
        if rp != os.pardir and not rp.startswith(os.pardir + os.sep):
            ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))

    if self._ignore_git_paths:
        ignored_paths.extend(
            [
                os.path.join(model_path, p)
                for p in [
                    ".git/",
                    ".gitattributes",
                    ".github/",
                    ".gitignore",
                ]
            ]
        )

    return self._serializer.serialize(
        pathlib.Path(model_path), ignore_paths=ignored_paths
    )

Hashes a model using the current configuration.

def use_file_serialization( self, *, hashing_algorithm: Literal['sha256', 'blake2'] = 'sha256', chunk_size: int = 1048576, max_workers: Optional[int] = None, allow_symlinks: bool = False, ignore_paths: Iterable[pathlib.Path] = frozenset()) -> Self:
def use_file_serialization(
    self,
    *,
    hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
    chunk_size: int = 1048576,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
    """Configures serialization to build a manifest of (file, hash) pairs.

    The serialization method in this configuration is changed to one where
    every file in the model is paired with its digest and a manifest
    containing all these pairings is being built.

    Args:
        hashing_algorithm: The hashing algorithm to use to hash a file.
        chunk_size: The amount of file to read at once. Default is 1MB. A
          special value of 0 signals to attempt to read everything in a
          single call.
        max_workers: Maximum number of workers to use in parallel. Default
          is to defer to the `concurrent.futures` library to select the best
          value for the current machine.
        allow_symlinks: Controls whether symbolic links are included. If a
          symlink is present but the flag is `False` (default) the
          serialization would raise an error.
        ignore_paths: Paths of files to ignore.

    Returns:
        The new hashing configuration with the new serialization method.
    """
    self._serializer = file.Serializer(
        self._build_file_hasher_factory(hashing_algorithm, chunk_size),
        max_workers=max_workers,
        allow_symlinks=allow_symlinks,
        ignore_paths=ignore_paths,
    )
    return self

Configures serialization to build a manifest of (file, hash) pairs.

The serialization method in this configuration is changed to one where every file in the model is paired with its digest and a manifest containing all these pairings is being built.

Arguments:
  • hashing_algorithm: The hashing algorithm to use to hash a file.
  • chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
  • max_workers: Maximum number of workers to use in parallel. Default is to defer to the concurrent.futures library to select the best value for the current machine.
  • allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is False (default) the serialization would raise an error.
  • ignore_paths: Paths of files to ignore.
Returns:

The new hashing configuration with the new serialization method.

def use_shard_serialization( self, *, hashing_algorithm: Literal['sha256', 'blake2'] = 'sha256', chunk_size: int = 1048576, shard_size: int = 1000000000, max_workers: Optional[int] = None, allow_symlinks: bool = False, ignore_paths: Iterable[pathlib.Path] = frozenset()) -> Self:
def use_shard_serialization(
    self,
    *,
    hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
    chunk_size: int = 1048576,
    shard_size: int = 1_000_000_000,
    max_workers: Optional[int] = None,
    allow_symlinks: bool = False,
    ignore_paths: Iterable[pathlib.Path] = frozenset(),
) -> Self:
    """Configures serialization to build a manifest of (shard, hash) pairs.

    The serialization method is switched to one that splits every file in
    the model into equal sized shards, pairs each shard with its digest,
    and builds a manifest collecting all these pairings.

    Args:
        hashing_algorithm: The hashing algorithm to use to hash a shard.
        chunk_size: The amount of file to read at once. Default is 1MB. A
          special value of 0 signals to attempt to read everything in a
          single call.
        shard_size: The size of a file shard. Default is 1 GB.
        max_workers: Maximum number of workers to use in parallel. Default
          is to defer to the `concurrent.futures` library to select the best
          value for the current machine.
        allow_symlinks: Controls whether symbolic links are included. If a
          symlink is present but the flag is `False` (default) the
          serialization would raise an error.
        ignore_paths: Paths of files to ignore.

    Returns:
        The new hashing configuration with the new serialization method.
    """
    hasher_factory = self._build_sharded_file_hasher_factory(
        hashing_algorithm, chunk_size, shard_size
    )
    self._serializer = file_shard.Serializer(
        hasher_factory,
        max_workers=max_workers,
        allow_symlinks=allow_symlinks,
        ignore_paths=ignore_paths,
    )
    return self

Configures serialization to build a manifest of (shard, hash) pairs.

The serialization method in this configuration is changed to one where every file in the model is sharded in equal sized shards, every shard is paired with its digest and a manifest containing all these pairings is being built.

Arguments:
  • hashing_algorithm: The hashing algorithm to use to hash a shard.
  • chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
  • shard_size: The size of a file shard. Default is 1 GB.
  • max_workers: Maximum number of workers to use in parallel. Default is to defer to the concurrent.futures library to select the best value for the current machine.
  • allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is False (default) the serialization would raise an error.
  • ignore_paths: Paths of files to ignore.
Returns:

The new hashing configuration with the new serialization method.

def set_ignored_paths( self, *, paths: Iterable[typing.Union[str, bytes, os.PathLike]], ignore_git_paths: bool = True) -> Self:
def set_ignored_paths(
    self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
) -> Self:
    """Configures the paths to be ignored during serialization of a model.

    If the model is a single file, there are no paths that are ignored. If
    the model is a directory, all paths are considered as relative to the
    model directory, since we never look at files outside of it.

    If an ignored path is a directory, serialization will ignore both the
    path and any of its children.

    Args:
        paths: The paths to ignore.
        ignore_git_paths: Whether to ignore git related paths (default) or
          include them in the signature.

    Returns:
        The new hashing configuration with a new set of ignored paths.
    """
    # Resolving normalizes odd inputs (e.g. 'x/../a/b' -> '.../a/b') so
    # that `hash` can reliably test whether each path is under the model.
    self._ignored_paths = frozenset(pathlib.Path(p).resolve() for p in paths)
    self._ignore_git_paths = ignore_git_paths
    return self

Configures the paths to be ignored during serialization of a model.

If the model is a single file, there are no paths that are ignored. If the model is a directory, all paths are considered as relative to the model directory, since we never look at files outside of it.

If an ignored path is a directory, serialization will ignore both the path and any of its children.

Arguments:
  • paths: The paths to ignore.
  • ignore_git_paths: Whether to ignore git related paths (default) or include them in the signature.
Returns:

The new hashing configuration with a new set of ignored paths.

def add_ignored_paths( self, *, model_path: Union[str, bytes, os.PathLike], paths: Iterable[typing.Union[str, bytes, os.PathLike]]) -> None:
def add_ignored_paths(
    self, *, model_path: PathLike, paths: Iterable[PathLike]
) -> None:
    """Add more paths to ignore to existing set of paths.

    Args:
        model_path: The path to the model.
        paths: Additional paths to ignore. All paths must be relative to
          the model directory.
    """
    # Anchor each new path under the model directory, then merge with the
    # paths already configured.
    anchored = (os.path.join(model_path, p) for p in paths)
    combined = set(self._ignored_paths)
    combined.update(anchored)
    self._ignored_paths = combined

Add more paths to ignore to existing set of paths.

Arguments:
  • model_path: The path to the model
  • paths: Additional paths to ignore. All paths must be relative to the model directory.