# mirror of https://github.com/immich-app/immich.git (synced 2025-01-09 21:36:46 +01:00)
from pathlib import Path

from PIL.Image import Image
from sentence_transformers import SentenceTransformer

from ..schemas import ModelType
from .base import InferenceModel
class CLIPSTEncoder(InferenceModel):
    _model_type = ModelType.CLIP

    def __init__(
        self,
        model_name: str,
        cache_dir: Path | None = None,
        **model_kwargs,
    ):
        super().__init__(model_name, cache_dir)
        self.model = SentenceTransformer(
            self.model_name,
            cache_folder=self.cache_dir.as_posix(),
            **model_kwargs,
        )

    def predict(self, image_or_text: Image | str) -> list[float]:
        return self.model.encode(image_or_text).tolist()


# stubs to allow different behavior between the two in the future
# and handle loading different image and text clip models
class CLIPSTVisionEncoder(CLIPSTEncoder):
    _model_type = ModelType.CLIP_VISION


class CLIPSTTextEncoder(CLIPSTEncoder):
    _model_type = ModelType.CLIP_TEXT
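

# Usage sketch (illustrative addition, not part of the upstream file): it
# assumes the relative imports above resolve inside the immich
# machine-learning package, that "clip-ViT-B-32" (a CLIP checkpoint published
# for sentence-transformers) is downloadable, and that "dog.jpg" is a
# placeholder path for a local image.
if __name__ == "__main__":
    from PIL import Image as PILImage

    encoder = CLIPSTEncoder("clip-ViT-B-32", cache_dir=Path("/tmp/clip"))

    # CLIP embeds text and images into the same vector space, so the two
    # embeddings below can be compared with cosine similarity for search.
    text_embedding = encoder.predict("a photo of a dog")
    image_embedding = encoder.predict(PILImage.open("dog.jpg"))
    print(len(text_embedding), len(image_embedding))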