# mirror of https://github.com/immich-app/immich.git (synced 2025-01-09 21:36:46 +01:00)
from pathlib import Path

from PIL.Image import Image
from sentence_transformers import SentenceTransformer

from ..schemas import ModelType
from .base import InferenceModel
class CLIPSTEncoder(InferenceModel):
    _model_type = ModelType.CLIP

    def __init__(
        self,
        model_name: str,
        cache_dir: Path | None = None,
        **model_kwargs,
    ):
        super().__init__(model_name, cache_dir)
        self.model = SentenceTransformer(
            self.model_name,
            cache_folder=self.cache_dir.as_posix(),
            **model_kwargs,
        )

    def predict(self, image_or_text: Image | str) -> list[float]:
        return self.model.encode(image_or_text).tolist()


# stubs to allow different behavior between the two in the future
# and handle loading different image and text clip models
class CLIPSTVisionEncoder(CLIPSTEncoder):
    _model_type = ModelType.CLIP_VISION


class CLIPSTTextEncoder(CLIPSTEncoder):
    _model_type = ModelType.CLIP_TEXT
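

# Usage sketch (illustrative addition, not part of the upstream file): it
# assumes the relative imports above resolve inside the immich
# machine-learning package, that "clip-ViT-B-32" (a CLIP checkpoint published
# for sentence-transformers) is downloadable, and that "dog.jpg" is a
# placeholder path for a local image.
if __name__ == "__main__":
    from PIL import Image as PILImage

    encoder = CLIPSTEncoder("clip-ViT-B-32", cache_dir=Path("/tmp/clip"))

    # CLIP embeds text and images into the same vector space, so the two
    # embeddings below can be compared with cosine similarity for search.
    text_embedding = encoder.predict("a photo of a dog")
    image_embedding = encoder.predict(PILImage.open("dog.jpg"))
    print(len(text_embedding), len(image_embedding))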