InvokeAI/invokeai/backend/model_manager/hash.py

# Copyright (c) 2023 Lincoln D. Stein and the InvokeAI Development Team
"""
Fast hashing of diffusers and checkpoint-style models.
Usage:
from invokeai.backend.model_managre.model_hash import FastModelHash
>>> FastModelHash.hash('/home/models/stable-diffusion-v1.5')
'a8e693a126ea5b831c96064dc569956f'
"""
import hashlib
import os
from pathlib import Path
from typing import Literal, Union
from blake3 import blake3
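# Only files with these extensions are included when hashing a model directory.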
MODEL_FILE_EXTENSIONS = (".ckpt", ".safetensors", ".bin", ".pt", ".pth")
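# Algorithm names accepted by ModelHash.hash(). "blake3" and "sha1_fast" are implemented in this
# module; the remaining names are dispatched to hashlib.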
ALGORITHMS = Literal[
"md5",
"sha1",
"sha1_fast",
"sha224",
"sha256",
"sha384",
"sha512",
"blake2b",
"blake2s",
"sha3_224",
"sha3_256",
"sha3_384",
"sha3_512",
"shake_128",
"shake_256",
"blake3",
]
class ModelHash:
"""ModelHash provides one public class method, hash()."""
@classmethod
def hash(cls, model_location: Union[str, Path], algorithm: ALGORITHMS = "blake3") -> str:
"""
Return hexdigest string for model located at model_location.
If model_location is a directory, the hash is computed by hashing the hashes of all model files in the
directory. The final composite hash is always computed using BLAKE3.
:param model_location: Path to the model
:param algorithm: Hashing algorithm to use
"""
model_location = Path(model_location)
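        # Dispatch on whether we were given a single checkpoint file or a diffusers-style directory.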
if model_location.is_file():
return cls._hash_file(model_location, algorithm)
elif model_location.is_dir():
return cls._hash_dir(model_location, algorithm)
else:
raise OSError(f"Not a valid file or directory: {model_location}")
@classmethod
def _hash_file(cls, model_location: Union[str, Path], algorithm: ALGORITHMS) -> str:
"""
Compute the hash for a single file and return its hexdigest.
:param model_location: Path to the model file
:param algorithm: Hashing algorithm to use
"""
if algorithm == "blake3":
return cls._blake3(model_location)
elif algorithm == "sha1_fast":
return cls._sha1_fast(model_location)
elif algorithm in hashlib.algorithms_available:
return cls._hashlib(model_location, algorithm)
else:
raise ValueError(f"Algorithm {algorithm} not available")
@classmethod
def _hash_dir(cls, model_location: Union[str, Path], algorithm: ALGORITHMS) -> str:
"""
Compute the hash for all files in a directory and return a hexdigest.
:param model_location: Path to the model directory
:param algorithm: Hashing algorithm to use
"""
components: list[str] = []
for root, _dirs, files in os.walk(model_location):
for file in files:
# only tally tensor files because diffusers config files change slightly
# depending on how the model was downloaded/converted.
if file.endswith(MODEL_FILE_EXTENSIONS):
components.append((Path(root, file).as_posix()))
component_hashes: list[str] = []
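        # Hash the files in sorted order so the composite hash does not depend on os.walk traversal order.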
for component in sorted(components):
component_hashes.append(cls._hash_file(component, algorithm))
# BLAKE3 is cryptographically secure. We may as well fall back on a secure algorithm
# for the composite hash
composite_hasher = blake3()
        for h in component_hashes:
composite_hasher.update(h.encode("utf-8"))
return composite_hasher.hexdigest()
@staticmethod
def _blake3(file_path: Union[str, Path]) -> str:
"""Hashes a file using BLAKE3"""
file_hasher = blake3(max_threads=blake3.AUTO)
file_hasher.update_mmap(file_path)
return file_hasher.hexdigest()
@staticmethod
def _sha1_fast(file_path: Union[str, Path]) -> str:
"""Hashes a file using SHA1, but with a block size of 2**16. The result is not a standard SHA1 hash due to the
# padding introduced by the block size. The algorithm is, however, very fast."""
BLOCK_SIZE = 2**16
file_hash = hashlib.sha1()
with open(file_path, "rb") as f:
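            # Only the first block is read and hashed; the rest of the file never affects the digest.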
data = f.read(BLOCK_SIZE)
file_hash.update(data)
return file_hash.hexdigest()
@staticmethod
def _hashlib(file_path: Union[str, Path], algorithm: ALGORITHMS) -> str:
"""Hashes a file using a hashlib algorithm"""
file_hasher = hashlib.new(algorithm)
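        # The whole file is read into memory in a single call before hashing.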
with open(file_path, "rb") as f:
file_hasher.update(f.read())
return file_hasher.hexdigest()
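

if __name__ == "__main__":
    # Illustrative usage sketch only, not part of the module API: hash a model with the default
    # BLAKE3 algorithm and again with SHA-256. The path below is a placeholder; point it at a real
    # checkpoint file or diffusers directory to try it out.
    example_path = Path("/home/models/stable-diffusion-v1.5")
    if example_path.exists():
        print(ModelHash.hash(example_path))
        print(ModelHash.hash(example_path, algorithm="sha256"))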