Edit on GitHub

model_signing.hashing

High level API for the hashing interface of model_signing library.

Hashing is used both for signing and verification and users should ensure that the same configuration is used in both cases.

The module could also be used to just hash a single model, without signing it:

model_signing.hashing.hash(model_path)

This module allows setting up the hashing configuration to a single variable and then sharing it between signing and verification.

hashing_config = model_signing.hashing.Config().set_ignored_paths(
    paths=["README.md"], ignore_git_paths=True
)

signing_config = (
    model_signing.signing.Config()
    .use_elliptic_key_signer(private_key="key")
    .set_hashing_config(hashing_config)
)

verifying_config = (
    model_signing.verifying.Config()
    .use_elliptic_key_verifier(public_key="key.pub")
    .set_hashing_config(hashing_config)
)

The API defined here is stable and backwards compatible.

  1# Copyright 2024 The Sigstore Authors
  2#
  3# Licensed under the Apache License, Version 2.0 (the "License");
  4# you may not use this file except in compliance with the License.
  5# You may obtain a copy of the License at
  6#
  7#      http://www.apache.org/licenses/LICENSE-2.0
  8#
  9# Unless required by applicable law or agreed to in writing, software
 10# distributed under the License is distributed on an "AS IS" BASIS,
 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12# See the License for the specific language governing permissions and
 13# limitations under the License.
 14
 15"""High level API for the hashing interface of `model_signing` library.
 16
 17Hashing is used both for signing and verification and users should ensure that
 18the same configuration is used in both cases.
 19
 20The module could also be used to just hash a single model, without signing it:
 21
 22```python
 23model_signing.hashing.hash(model_path)
 24```
 25
 26This module allows setting up the hashing configuration to a single variable and
 27then sharing it between signing and verification.
 28
 29```python
 30hashing_config = model_signing.hashing.Config().set_ignored_paths(
 31    paths=["README.md"], ignore_git_paths=True
 32)
 33
 34signing_config = (
 35    model_signing.signing.Config()
 36    .use_elliptic_key_signer(private_key="key")
 37    .set_hashing_config(hashing_config)
 38)
 39
 40verifying_config = (
 41    model_signing.verifying.Config()
 42    .use_elliptic_key_verifier(public_key="key.pub")
 43    .set_hashing_config(hashing_config)
 44)
 45```
 46
 47The API defined here is stable and backwards compatible.
 48"""
 49
 50from collections.abc import Callable, Iterable
 51import os
 52import pathlib
 53import sys
 54from typing import Literal, Optional, Union
 55
 56from model_signing import manifest
 57from model_signing._hashing import hashing
 58from model_signing._hashing import io
 59from model_signing._hashing import memory
 60from model_signing._serialization import file
 61from model_signing._serialization import file_shard
 62
 63
 64if sys.version_info >= (3, 11):
 65    from typing import Self
 66else:
 67    from typing_extensions import Self
 68
 69
 70# `TypeAlias` only exists from Python 3.10
 71# `TypeAlias` is deprecated in Python 3.12 in favor of `type`
 72if sys.version_info >= (3, 10):
 73    from typing import TypeAlias
 74else:
 75    from typing_extensions import TypeAlias
 76
 77
 78# Type alias to support `os.PathLike`, `str` and `bytes` objects in the API
 79# When Python 3.12 is the minimum supported version we can use `type`
 80# When Python 3.11 is the minimum supported version we can use `|`
 81PathLike: TypeAlias = Union[str, bytes, os.PathLike]
 82
 83
 84def hash(model_path: PathLike) -> manifest.Manifest:
 85    """Hashes a model using the default configuration.
 86
 87    Hashing is the shared part between signing and verification and is also
 88    expected to be the slowest component. When serializing a model, we need to
 89    spend time proportional to the model size on disk.
 90
 91    This method returns a "manifest" of the model. A manifest is a collection of
 92    every object in the model, paired with the corresponding hash. Currently, we
 93    consider an object in the model to be either a file or a shard of the file.
 94    Large models with large files will be hashed much faster when every shard is
 95    hashed in parallel, at the cost of generating a larger payload for the
 96    signature. In future releases we could support hashing individual tensors or
 97    tensor slices for further speed optimizations for very large models.
 98
 99    Args:
100        model_path: The path to the model to hash.
101
102    Returns:
103        A manifest of the hashed model.
104    """
105    return Config().hash(model_path)
106
107
class Config:
    """Configuration to use when hashing models.

    Hashing is the shared part between signing and verification and is also
    expected to be the slowest component. When serializing a model, we need to
    spend time proportional to the model size on disk.

    Hashing builds a "manifest" of the model. A manifest is a collection of
    every object in the model, paired with the corresponding hash. Currently, we
    consider an object in the model to be either a file or a shard of the file.
    Large models with large files will be hashed much faster when every shard is
    hashed in parallel, at the cost of generating a larger payload for the
    signature. In future releases we could support hashing individual tensors or
    tensor slices for further speed optimizations for very large models.

    This configuration class supports configuring the hashing granularity. By
    default, we hash at file level granularity.

    This configuration class also supports configuring the hash method used to
    generate the hash for every object in the model. We currently support SHA256
    and BLAKE2, with SHA256 being the default.

    This configuration class also supports configuring which paths from the
    model directory should be ignored. These are files that don't impact the
    behavior of the model, or files that won't be distributed with the model. By
    default, only files that are associated with a git repository (`.git`,
    `.gitattributes`, `.gitignore`, etc.) are ignored.
    """

    def __init__(self):
        """Initializes the default configuration for hashing."""
        self._ignored_paths = frozenset()
        self._ignore_git_paths = True
        self.use_file_serialization()
        self._allow_symlinks = False

    def hash(self, model_path: PathLike) -> manifest.Manifest:
        """Hashes a model using the current configuration."""
        # All paths in ignored_paths must have model_path as prefix; entries
        # that fall outside the model directory are silently dropped.
        ignored_paths = []
        for p in self._ignored_paths:
            rp = os.path.relpath(p, model_path)
            # rp starts with the parent-directory marker when p is not under
            # model_path. Use os.pardir/os.sep instead of the literal "../"
            # so the check also works on Windows, and handle the case where
            # rp is exactly "..".
            if rp != os.pardir and not rp.startswith(os.pardir + os.sep):
                ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))

        if self._ignore_git_paths:
            # Standard git-related paths; the trailing "/" entries are
            # directories whose whole subtree is ignored.
            ignored_paths.extend(
                [
                    pathlib.Path(os.path.join(model_path, p))
                    for p in [
                        ".git/",
                        ".gitattributes",
                        ".github/",
                        ".gitignore",
                    ]
                ]
            )

        self._serializer.set_allow_symlinks(self._allow_symlinks)

        return self._serializer.serialize(
            pathlib.Path(model_path), ignore_paths=ignored_paths
        )

    def _build_stream_hasher(
        self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
    ) -> hashing.StreamingHashEngine:
        """Builds a streaming hasher from a constant string.

        Args:
            hashing_algorithm: The hashing algorithm to use.

        Returns:
            An instance of the requested hasher.

        Raises:
            ValueError: If the algorithm name is not supported.
        """
        # TODO: Once Python 3.9 support is deprecated revert to using `match`
        if hashing_algorithm == "sha256":
            return memory.SHA256()
        if hashing_algorithm == "blake2":
            return memory.BLAKE2()

        raise ValueError(f"Unsupported hashing method {hashing_algorithm}")

    def _build_file_hasher_factory(
        self,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
    ) -> Callable[[pathlib.Path], io.SimpleFileHasher]:
        """Builds the hasher factory for a serialization by file.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.

        Returns:
            The hasher factory that should be used by the active serialization
            method.
        """

        def _factory(path: pathlib.Path) -> io.SimpleFileHasher:
            hasher = self._build_stream_hasher(hashing_algorithm)
            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)

        return _factory

    def _build_sharded_file_hasher_factory(
        self,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        shard_size: int = 1_000_000_000,
    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
        """Builds the hasher factory for a serialization by file shards.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a shard.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            shard_size: The size of a file shard. Default is 1 GB.

        Returns:
            The hasher factory that should be used by the active serialization
            method.
        """

        def _factory(
            path: pathlib.Path, start: int, end: int
        ) -> io.ShardedFileHasher:
            hasher = self._build_stream_hasher(hashing_algorithm)
            return io.ShardedFileHasher(
                path,
                hasher,
                start=start,
                end=end,
                chunk_size=chunk_size,
                shard_size=shard_size,
            )

        return _factory

    def use_file_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (file, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is paired with its digest and a manifest
        containing all these pairings is being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a file.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library to select the best
              value for the current machine.
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: Paths of files to ignore.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file.Serializer(
            self._build_file_hasher_factory(hashing_algorithm, chunk_size),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

    def use_shard_serialization(
        self,
        *,
        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
        chunk_size: int = 1048576,
        shard_size: int = 1_000_000_000,
        max_workers: Optional[int] = None,
        allow_symlinks: bool = False,
        ignore_paths: Iterable[pathlib.Path] = frozenset(),
    ) -> Self:
        """Configures serialization to build a manifest of (shard, hash) pairs.

        The serialization method in this configuration is changed to one where
        every file in the model is sharded in equal sized shards, every shard is
        paired with its digest and a manifest containing all these pairings is
        being built.

        Args:
            hashing_algorithm: The hashing algorithm to use to hash a shard.
            chunk_size: The amount of file to read at once. Default is 1MB. A
              special value of 0 signals to attempt to read everything in a
              single call.
            shard_size: The size of a file shard. Default is 1 GB.
            max_workers: Maximum number of workers to use in parallel. Default
              is to defer to the `concurrent.futures` library to select the best
              value for the current machine.
            allow_symlinks: Controls whether symbolic links are included. If a
              symlink is present but the flag is `False` (default) the
              serialization would raise an error.
            ignore_paths: Paths of files to ignore.

        Returns:
            The new hashing configuration with the new serialization method.
        """
        self._serializer = file_shard.Serializer(
            self._build_sharded_file_hasher_factory(
                hashing_algorithm, chunk_size, shard_size
            ),
            max_workers=max_workers,
            allow_symlinks=allow_symlinks,
            ignore_paths=ignore_paths,
        )
        return self

    def set_ignored_paths(
        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
    ) -> Self:
        """Configures the paths to be ignored during serialization of a model.

        If the model is a single file, there are no paths that are ignored. If
        the model is a directory, all paths are considered as relative to the
        model directory, since we never look at files outside of it.

        If an ignored path is a directory, serialization will ignore both the
        path and any of its children.

        Args:
            paths: The paths to ignore.
            ignore_git_paths: Whether to ignore git related paths (default) or
              include them in the signature.

        Returns:
            The new hashing configuration with a new set of ignored paths.
        """
        # Resolve to absolute, normalized paths so equivalent spellings
        # (e.g. '../model/a' vs 'a' from inside the model dir) compare equal.
        self._ignored_paths = frozenset(
            {pathlib.Path(p).resolve() for p in paths}
        )
        self._ignore_git_paths = ignore_git_paths
        return self

    def add_ignored_paths(
        self, *, model_path: PathLike, paths: Iterable[PathLike]
    ) -> None:
        """Add more paths to ignore to existing set of paths.

        Args:
            model_path: The path to the model.
            paths: Additional paths to ignore. All paths must be relative to
              the model directory.
        """
        newset = set(self._ignored_paths)
        # Resolve joined paths so entries are normalized the same way as in
        # `set_ignored_paths`.
        newset.update(
            pathlib.Path(os.path.join(model_path, p)).resolve() for p in paths
        )
        self._ignored_paths = newset

    def set_allow_symlinks(self, allow_symlinks: bool) -> Self:
        """Set whether following symlinks is allowed."""
        self._allow_symlinks = allow_symlinks
        return self
PathLike: TypeAlias = Union[str, bytes, os.PathLike]
def hash(model_path: Union[str, bytes, os.PathLike]) -> model_signing.manifest.Manifest:
 85def hash(model_path: PathLike) -> manifest.Manifest:
 86    """Hashes a model using the default configuration.
 87
 88    Hashing is the shared part between signing and verification and is also
 89    expected to be the slowest component. When serializing a model, we need to
 90    spend time proportional to the model size on disk.
 91
 92    This method returns a "manifest" of the model. A manifest is a collection of
 93    every object in the model, paired with the corresponding hash. Currently, we
 94    consider an object in the model to be either a file or a shard of the file.
 95    Large models with large files will be hashed much faster when every shard is
 96    hashed in parallel, at the cost of generating a larger payload for the
 97    signature. In future releases we could support hashing individual tensors or
 98    tensor slices for further speed optimizations for very large models.
 99
100    Args:
101        model_path: The path to the model to hash.
102
103    Returns:
104        A manifest of the hashed model.
105    """
106    return Config().hash(model_path)

Hashes a model using the default configuration.

Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.

This method returns a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.

Arguments:
  • model_path: The path to the model to hash.
Returns:

A manifest of the hashed model.

class Config:
109class Config:
110    """Configuration to use when hashing models.
111
112    Hashing is the shared part between signing and verification and is also
113    expected to be the slowest component. When serializing a model, we need to
114    spend time proportional to the model size on disk.
115
116    Hashing builds a "manifest" of the model. A manifest is a collection of
117    every object in the model, paired with the corresponding hash. Currently, we
118    consider an object in the model to be either a file or a shard of the file.
119    Large models with large files will be hashed much faster when every shard is
120    hashed in parallel, at the cost of generating a larger payload for the
121    signature. In future releases we could support hashing individual tensors or
122    tensor slices for further speed optimizations for very large models.
123
124    This configuration class supports configuring the hashing granularity. By
125    default, we hash at file level granularity.
126
127    This configuration class also supports configuring the hash method used to
128    generate the hash for every object in the model. We currently support SHA256
129    and BLAKE2, with SHA256 being the default.
130
131    This configuration class also supports configuring which paths from the
132    model directory should be ignored. These are files that doesn't impact the
133    behavior of the model, or files that won't be distributed with the model. By
134    default, only files that are associated with a git repository (`.git`,
135    `.gitattributes`, `.gitignore`, etc.) are ignored.
136    """
137
138    def __init__(self):
139        """Initializes the default configuration for hashing."""
140        self._ignored_paths = frozenset()
141        self._ignore_git_paths = True
142        self.use_file_serialization()
143        self._allow_symlinks = False
144
145    def hash(self, model_path: PathLike) -> manifest.Manifest:
146        """Hashes a model using the current configuration."""
147        # All paths in ignored_paths must have model_path as prefix
148        ignored_paths = []
149        for p in self._ignored_paths:
150            rp = os.path.relpath(p, model_path)
151            # rp may start with "../" if it is not relative to model_path
152            if not rp.startswith("../"):
153                ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))
154
155        if self._ignore_git_paths:
156            ignored_paths.extend(
157                [
158                    os.path.join(model_path, p)
159                    for p in [
160                        ".git/",
161                        ".gitattributes",
162                        ".github/",
163                        ".gitignore",
164                    ]
165                ]
166            )
167
168        self._serializer.set_allow_symlinks(self._allow_symlinks)
169
170        return self._serializer.serialize(
171            pathlib.Path(model_path), ignore_paths=ignored_paths
172        )
173
174    def _build_stream_hasher(
175        self, hashing_algorithm: Literal["sha256", "blake2"] = "sha256"
176    ) -> hashing.StreamingHashEngine:
177        """Builds a streaming hasher from a constant string.
178
179        Args:
180            hashing_algorithm: The hashing algorithm to use.
181
182        Returns:
183            An instance of the requested hasher.
184        """
185        # TODO: Once Python 3.9 support is deprecated revert to using `match`
186        if hashing_algorithm == "sha256":
187            return memory.SHA256()
188        if hashing_algorithm == "blake2":
189            return memory.BLAKE2()
190
191        raise ValueError(f"Unsupported hashing method {hashing_algorithm}")
192
193    def _build_file_hasher_factory(
194        self,
195        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
196        chunk_size: int = 1048576,
197    ) -> Callable[[pathlib.Path], io.SimpleFileHasher]:
198        """Builds the hasher factory for a serialization by file.
199
200        Args:
201            hashing_algorithm: The hashing algorithm to use to hash a file.
202            chunk_size: The amount of file to read at once. Default is 1MB. A
203              special value of 0 signals to attempt to read everything in a
204              single call.
205
206        Returns:
207            The hasher factory that should be used by the active serialization
208            method.
209        """
210
211        def _factory(path: pathlib.Path) -> io.SimpleFileHasher:
212            hasher = self._build_stream_hasher(hashing_algorithm)
213            return io.SimpleFileHasher(path, hasher, chunk_size=chunk_size)
214
215        return _factory
216
217    def _build_sharded_file_hasher_factory(
218        self,
219        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
220        chunk_size: int = 1048576,
221        shard_size: int = 1_000_000_000,
222    ) -> Callable[[pathlib.Path, int, int], io.ShardedFileHasher]:
223        """Builds the hasher factory for a serialization by file shards.
224
225        Args:
226            hashing_algorithm: The hashing algorithm to use to hash a shard.
227            chunk_size: The amount of file to read at once. Default is 1MB. A
228              special value of 0 signals to attempt to read everything in a
229              single call.
230            shard_size: The size of a file shard. Default is 1 GB.
231
232        Returns:
233            The hasher factory that should be used by the active serialization
234            method.
235        """
236
237        def _factory(
238            path: pathlib.Path, start: int, end: int
239        ) -> io.ShardedFileHasher:
240            hasher = self._build_stream_hasher(hashing_algorithm)
241            return io.ShardedFileHasher(
242                path,
243                hasher,
244                start=start,
245                end=end,
246                chunk_size=chunk_size,
247                shard_size=shard_size,
248            )
249
250        return _factory
251
252    def use_file_serialization(
253        self,
254        *,
255        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
256        chunk_size: int = 1048576,
257        max_workers: Optional[int] = None,
258        allow_symlinks: bool = False,
259        ignore_paths: Iterable[pathlib.Path] = frozenset(),
260    ) -> Self:
261        """Configures serialization to build a manifest of (file, hash) pairs.
262
263        The serialization method in this configuration is changed to one where
264        every file in the model is paired with its digest and a manifest
265        containing all these pairings is being built.
266
267        Args:
268            hashing_algorithm: The hashing algorithm to use to hash a file.
269            chunk_size: The amount of file to read at once. Default is 1MB. A
270              special value of 0 signals to attempt to read everything in a
271              single call.
272            max_workers: Maximum number of workers to use in parallel. Default
273              is to defer to the `concurrent.futures` library to select the best
274              value for the current machine.
275            allow_symlinks: Controls whether symbolic links are included. If a
276              symlink is present but the flag is `False` (default) the
277              serialization would raise an error.
278
279        Returns:
280            The new hashing configuration with the new serialization method.
281        """
282        self._serializer = file.Serializer(
283            self._build_file_hasher_factory(hashing_algorithm, chunk_size),
284            max_workers=max_workers,
285            allow_symlinks=allow_symlinks,
286            ignore_paths=ignore_paths,
287        )
288        return self
289
290    def use_shard_serialization(
291        self,
292        *,
293        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
294        chunk_size: int = 1048576,
295        shard_size: int = 1_000_000_000,
296        max_workers: Optional[int] = None,
297        allow_symlinks: bool = False,
298        ignore_paths: Iterable[pathlib.Path] = frozenset(),
299    ) -> Self:
300        """Configures serialization to build a manifest of (shard, hash) pairs.
301
302        The serialization method in this configuration is changed to one where
303        every file in the model is sharded in equal sized shards, every shard is
304        paired with its digest and a manifest containing all these pairings is
305        being built.
306
307        Args:
308            hashing_algorithm: The hashing algorithm to use to hash a shard.
309            chunk_size: The amount of file to read at once. Default is 1MB. A
310              special value of 0 signals to attempt to read everything in a
311              single call.
312            shard_size: The size of a file shard. Default is 1 GB.
313            max_workers: Maximum number of workers to use in parallel. Default
314              is to defer to the `concurrent.futures` library to select the best
315              value for the current machine.
316            allow_symlinks: Controls whether symbolic links are included. If a
317              symlink is present but the flag is `False` (default) the
318              serialization would raise an error.
319            ignore_paths: Paths of files to ignore.
320
321        Returns:
322            The new hashing configuration with the new serialization method.
323        """
324        self._serializer = file_shard.Serializer(
325            self._build_sharded_file_hasher_factory(
326                hashing_algorithm, chunk_size, shard_size
327            ),
328            max_workers=max_workers,
329            allow_symlinks=allow_symlinks,
330            ignore_paths=ignore_paths,
331        )
332        return self
333
334    def set_ignored_paths(
335        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
336    ) -> Self:
337        """Configures the paths to be ignored during serialization of a model.
338
339        If the model is a single file, there are no paths that are ignored. If
340        the model is a directory, all paths are considered as relative to the
341        model directory, since we never look at files outside of it.
342
343        If an ignored path is a directory, serialization will ignore both the
344        path and any of its children.
345
346        Args:
347            paths: The paths to ignore.
348            ignore_git_paths: Whether to ignore git related paths (default) or
349              include them in the signature.
350
351        Returns:
352            The new hashing configuration with a new set of ignored paths.
353        """
354        # Use relpath to possibly fix weird paths like '../a/b' -> 'b'
355        # when '../a/' is a no-op
356        self._ignored_paths = frozenset(
357            {pathlib.Path(p).resolve() for p in paths}
358        )
359        self._ignore_git_paths = ignore_git_paths
360        return self
361
362    def add_ignored_paths(
363        self, *, model_path: PathLike, paths: Iterable[PathLike]
364    ) -> None:
365        """Add more paths to ignore to existing set of paths.
366
367        Args:
368            model_path: The path to the model
369            paths: Additional paths to ignore. All path must be relative to
370                   the model directory.
371        """
372        newset = set(self._ignored_paths)
373        newset.update([os.path.join(model_path, p) for p in paths])
374        self._ignored_paths = newset
375
376    def set_allow_symlinks(self, allow_symlinks: bool) -> Self:
377        """Set whether following symlinks is allowed."""
378        self._allow_symlinks = allow_symlinks
379        return self

Configuration to use when hashing models.

Hashing is the shared part between signing and verification and is also expected to be the slowest component. When serializing a model, we need to spend time proportional to the model size on disk.

Hashing builds a "manifest" of the model. A manifest is a collection of every object in the model, paired with the corresponding hash. Currently, we consider an object in the model to be either a file or a shard of the file. Large models with large files will be hashed much faster when every shard is hashed in parallel, at the cost of generating a larger payload for the signature. In future releases we could support hashing individual tensors or tensor slices for further speed optimizations for very large models.

This configuration class supports configuring the hashing granularity. By default, we hash at file level granularity.

This configuration class also supports configuring the hash method used to generate the hash for every object in the model. We currently support SHA256 and BLAKE2, with SHA256 being the default.

This configuration class also supports configuring which paths from the model directory should be ignored. These are files that don't impact the behavior of the model, or files that won't be distributed with the model. By default, only files that are associated with a git repository (.git, .gitattributes, .gitignore, etc.) are ignored.

Config()
138    def __init__(self):
139        """Initializes the default configuration for hashing."""
140        self._ignored_paths = frozenset()
141        self._ignore_git_paths = True
142        self.use_file_serialization()
143        self._allow_symlinks = False

Initializes the default configuration for hashing.

def hash( self, model_path: Union[str, bytes, os.PathLike]) -> model_signing.manifest.Manifest:
145    def hash(self, model_path: PathLike) -> manifest.Manifest:
146        """Hashes a model using the current configuration."""
147        # All paths in ignored_paths must have model_path as prefix
148        ignored_paths = []
149        for p in self._ignored_paths:
150            rp = os.path.relpath(p, model_path)
151            # rp may start with "../" if it is not relative to model_path
152            if not rp.startswith("../"):
153                ignored_paths.append(pathlib.Path(os.path.join(model_path, rp)))
154
155        if self._ignore_git_paths:
156            ignored_paths.extend(
157                [
158                    os.path.join(model_path, p)
159                    for p in [
160                        ".git/",
161                        ".gitattributes",
162                        ".github/",
163                        ".gitignore",
164                    ]
165                ]
166            )
167
168        self._serializer.set_allow_symlinks(self._allow_symlinks)
169
170        return self._serializer.serialize(
171            pathlib.Path(model_path), ignore_paths=ignored_paths
172        )

Hashes a model using the current configuration.

def use_file_serialization( self, *, hashing_algorithm: Literal['sha256', 'blake2'] = 'sha256', chunk_size: int = 1048576, max_workers: Optional[int] = None, allow_symlinks: bool = False, ignore_paths: Iterable[pathlib.Path] = frozenset()) -> Self:
252    def use_file_serialization(
253        self,
254        *,
255        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
256        chunk_size: int = 1048576,
257        max_workers: Optional[int] = None,
258        allow_symlinks: bool = False,
259        ignore_paths: Iterable[pathlib.Path] = frozenset(),
260    ) -> Self:
261        """Configures serialization to build a manifest of (file, hash) pairs.
262
263        The serialization method in this configuration is changed to one where
264        every file in the model is paired with its digest and a manifest
265        containing all these pairings is being built.
266
267        Args:
268            hashing_algorithm: The hashing algorithm to use to hash a file.
269            chunk_size: The amount of file to read at once. Default is 1MB. A
270              special value of 0 signals to attempt to read everything in a
271              single call.
272            max_workers: Maximum number of workers to use in parallel. Default
273              is to defer to the `concurrent.futures` library to select the best
274              value for the current machine.
275            allow_symlinks: Controls whether symbolic links are included. If a
276              symlink is present but the flag is `False` (default) the
277              serialization would raise an error.
278
279        Returns:
280            The new hashing configuration with the new serialization method.
281        """
282        self._serializer = file.Serializer(
283            self._build_file_hasher_factory(hashing_algorithm, chunk_size),
284            max_workers=max_workers,
285            allow_symlinks=allow_symlinks,
286            ignore_paths=ignore_paths,
287        )
288        return self

Configures serialization to build a manifest of (file, hash) pairs.

The serialization method in this configuration is changed to one where every file in the model is paired with its digest and a manifest containing all these pairings is being built.

Arguments:
  • hashing_algorithm: The hashing algorithm to use to hash a file.
  • chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
  • max_workers: Maximum number of workers to use in parallel. Default is to defer to the concurrent.futures library to select the best value for the current machine.
  • allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is False (default) the serialization would raise an error.
  • ignore_paths: Paths of files to ignore.
Returns:

The new hashing configuration with the new serialization method.

def use_shard_serialization( self, *, hashing_algorithm: Literal['sha256', 'blake2'] = 'sha256', chunk_size: int = 1048576, shard_size: int = 1000000000, max_workers: Optional[int] = None, allow_symlinks: bool = False, ignore_paths: Iterable[pathlib.Path] = frozenset()) -> Self:
290    def use_shard_serialization(
291        self,
292        *,
293        hashing_algorithm: Literal["sha256", "blake2"] = "sha256",
294        chunk_size: int = 1048576,
295        shard_size: int = 1_000_000_000,
296        max_workers: Optional[int] = None,
297        allow_symlinks: bool = False,
298        ignore_paths: Iterable[pathlib.Path] = frozenset(),
299    ) -> Self:
300        """Configures serialization to build a manifest of (shard, hash) pairs.
301
302        The serialization method in this configuration is changed to one where
303        every file in the model is sharded in equal sized shards, every shard is
304        paired with its digest and a manifest containing all these pairings is
305        being built.
306
307        Args:
308            hashing_algorithm: The hashing algorithm to use to hash a shard.
309            chunk_size: The amount of file to read at once. Default is 1MB. A
310              special value of 0 signals to attempt to read everything in a
311              single call.
312            shard_size: The size of a file shard. Default is 1 GB.
313            max_workers: Maximum number of workers to use in parallel. Default
314              is to defer to the `concurrent.futures` library to select the best
315              value for the current machine.
316            allow_symlinks: Controls whether symbolic links are included. If a
317              symlink is present but the flag is `False` (default) the
318              serialization would raise an error.
319            ignore_paths: Paths of files to ignore.
320
321        Returns:
322            The new hashing configuration with the new serialization method.
323        """
324        self._serializer = file_shard.Serializer(
325            self._build_sharded_file_hasher_factory(
326                hashing_algorithm, chunk_size, shard_size
327            ),
328            max_workers=max_workers,
329            allow_symlinks=allow_symlinks,
330            ignore_paths=ignore_paths,
331        )
332        return self

Configures serialization to build a manifest of (shard, hash) pairs.

The serialization method in this configuration is changed to one where every file in the model is sharded in equal sized shards, every shard is paired with its digest and a manifest containing all these pairings is being built.

Arguments:
  • hashing_algorithm: The hashing algorithm to use to hash a shard.
  • chunk_size: The amount of file to read at once. Default is 1MB. A special value of 0 signals to attempt to read everything in a single call.
  • shard_size: The size of a file shard. Default is 1 GB.
  • max_workers: Maximum number of workers to use in parallel. Default is to defer to the concurrent.futures library to select the best value for the current machine.
  • allow_symlinks: Controls whether symbolic links are included. If a symlink is present but the flag is False (default) the serialization would raise an error.
  • ignore_paths: Paths of files to ignore.
Returns:

The new hashing configuration with the new serialization method.

def set_ignored_paths( self, *, paths: Iterable[typing.Union[str, bytes, os.PathLike]], ignore_git_paths: bool = True) -> Self:
334    def set_ignored_paths(
335        self, *, paths: Iterable[PathLike], ignore_git_paths: bool = True
336    ) -> Self:
337        """Configures the paths to be ignored during serialization of a model.
338
339        If the model is a single file, there are no paths that are ignored. If
340        the model is a directory, all paths are considered as relative to the
341        model directory, since we never look at files outside of it.
342
343        If an ignored path is a directory, serialization will ignore both the
344        path and any of its children.
345
346        Args:
347            paths: The paths to ignore.
348            ignore_git_paths: Whether to ignore git related paths (default) or
349              include them in the signature.
350
351        Returns:
352            The new hashing configuration with a new set of ignored paths.
353        """
354        # Use relpath to possibly fix weird paths like '../a/b' -> 'b'
355        # when '../a/' is a no-op
356        self._ignored_paths = frozenset(
357            {pathlib.Path(p).resolve() for p in paths}
358        )
359        self._ignore_git_paths = ignore_git_paths
360        return self

Configures the paths to be ignored during serialization of a model.

If the model is a single file, there are no paths that are ignored. If the model is a directory, all paths are considered as relative to the model directory, since we never look at files outside of it.

If an ignored path is a directory, serialization will ignore both the path and any of its children.

Arguments:
  • paths: The paths to ignore.
  • ignore_git_paths: Whether to ignore git related paths (default) or include them in the signature.
Returns:

The new hashing configuration with a new set of ignored paths.

def add_ignored_paths( self, *, model_path: Union[str, bytes, os.PathLike], paths: Iterable[typing.Union[str, bytes, os.PathLike]]) -> None:
362    def add_ignored_paths(
363        self, *, model_path: PathLike, paths: Iterable[PathLike]
364    ) -> None:
365        """Add more paths to ignore to existing set of paths.
366
367        Args:
368            model_path: The path to the model
369            paths: Additional paths to ignore. All path must be relative to
370                   the model directory.
371        """
372        newset = set(self._ignored_paths)
373        newset.update([os.path.join(model_path, p) for p in paths])
374        self._ignored_paths = newset

Add more paths to ignore to existing set of paths.

Arguments:
  • model_path: The path to the model
  • paths: Additional paths to ignore. All paths must be relative to the model directory.