diff --git a/machine-learning/app/config.py b/machine-learning/app/config.py index ab27499a25..9b98eecded 100644 --- a/machine-learning/app/config.py +++ b/machine-learning/app/config.py @@ -12,8 +12,6 @@ from rich.logging import RichHandler from uvicorn import Server from uvicorn.workers import UvicornWorker -from .schemas import ModelType - class PreloadModelData(BaseModel): clip: str | None @@ -21,7 +19,7 @@ class PreloadModelData(BaseModel): class Settings(BaseSettings): - cache_folder: str = "/cache" + cache_folder: Path = Path("/cache") model_ttl: int = 300 model_ttl_poll_s: int = 10 host: str = "0.0.0.0" @@ -55,14 +53,6 @@ def clean_name(model_name: str) -> str: return model_name.split("/")[-1].translate(_clean_name) -def get_cache_dir(model_name: str, model_type: ModelType) -> Path: - return Path(settings.cache_folder) / model_type.value / clean_name(model_name) - - -def get_hf_model_name(model_name: str) -> str: - return f"immich-app/{clean_name(model_name)}" - - LOG_LEVELS: dict[str, int] = { "critical": logging.ERROR, "error": logging.ERROR, diff --git a/machine-learning/app/main.py b/machine-learning/app/main.py index 277ad76898..3c607015d9 100644 --- a/machine-learning/app/main.py +++ b/machine-learning/app/main.py @@ -6,22 +6,34 @@ import threading import time from concurrent.futures import ThreadPoolExecutor from contextlib import asynccontextmanager +from functools import partial from typing import Any, AsyncGenerator, Callable, Iterator from zipfile import BadZipFile import orjson -from fastapi import Depends, FastAPI, Form, HTTPException, UploadFile +from fastapi import Depends, FastAPI, File, Form, HTTPException from fastapi.responses import ORJSONResponse from onnxruntime.capi.onnxruntime_pybind11_state import InvalidProtobuf, NoSuchFile +from PIL.Image import Image +from pydantic import ValidationError from starlette.formparsers import MultiPartParser +from app.models import get_model_deps from app.models.base import InferenceModel +from app.models.transforms import decode_pil from .config import PreloadModelData, log, settings from .models.cache import ModelCache from .schemas import ( + InferenceEntries, + InferenceEntry, + InferenceResponse, MessageResponse, + ModelIdentity, + ModelTask, ModelType, + PipelineRequest, + T, TextResponse, ) @@ -63,12 +75,21 @@ async def lifespan(_: FastAPI) -> AsyncGenerator[None, None]: gc.collect() -async def preload_models(preload_models: PreloadModelData) -> None: - log.info(f"Preloading models: {preload_models}") - if preload_models.clip is not None: - await load(await model_cache.get(preload_models.clip, ModelType.CLIP)) - if preload_models.facial_recognition is not None: - await load(await model_cache.get(preload_models.facial_recognition, ModelType.FACIAL_RECOGNITION)) +async def preload_models(preload: PreloadModelData) -> None: + log.info(f"Preloading models: {preload}") + if preload.clip is not None: + model = await model_cache.get(preload.clip, ModelType.TEXTUAL, ModelTask.SEARCH) + await load(model) + + model = await model_cache.get(preload.clip, ModelType.VISUAL, ModelTask.SEARCH) + await load(model) + + if preload.facial_recognition is not None: + model = await model_cache.get(preload.facial_recognition, ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION) + await load(model) + + model = await model_cache.get(preload.facial_recognition, ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION) + await load(model) def update_state() -> Iterator[None]: @@ -81,6 +102,27 @@ def update_state() -> Iterator[None]: active_requests -= 1 
+def get_entries(entries: str = Form()) -> InferenceEntries: + try: + request: PipelineRequest = orjson.loads(entries) + without_deps: list[InferenceEntry] = [] + with_deps: list[InferenceEntry] = [] + for task, types in request.items(): + for type, entry in types.items(): + parsed: InferenceEntry = { + "name": entry["modelName"], + "task": task, + "type": type, + "options": entry.get("options", {}), + } + dep = get_model_deps(parsed["name"], type, task) + (with_deps if dep else without_deps).append(parsed) + return without_deps, with_deps + except (orjson.JSONDecodeError, ValidationError, KeyError, AttributeError) as e: + log.error(f"Invalid request format: {e}") + raise HTTPException(422, "Invalid request format.") + + app = FastAPI(lifespan=lifespan) @@ -96,42 +138,63 @@ def ping() -> str: @app.post("/predict", dependencies=[Depends(update_state)]) async def predict( - model_name: str = Form(alias="modelName"), - model_type: ModelType = Form(alias="modelType"), - options: str = Form(default="{}"), + entries: InferenceEntries = Depends(get_entries), + image: bytes | None = File(default=None), text: str | None = Form(default=None), - image: UploadFile | None = None, ) -> Any: if image is not None: - inputs: str | bytes = await image.read() + inputs: Image | str = await run(lambda: decode_pil(image)) elif text is not None: inputs = text else: raise HTTPException(400, "Either image or text must be provided") - try: - kwargs = orjson.loads(options) - except orjson.JSONDecodeError: - raise HTTPException(400, f"Invalid options JSON: {options}") - - model = await load(await model_cache.get(model_name, model_type, ttl=settings.model_ttl, **kwargs)) - model.configure(**kwargs) - outputs = await run(model.predict, inputs) - return ORJSONResponse(outputs) + response = await run_inference(inputs, entries) + return ORJSONResponse(response) -async def run(func: Callable[..., Any], inputs: Any) -> Any: +async def run_inference(payload: Image | str, entries: InferenceEntries) -> InferenceResponse: + outputs: dict[ModelIdentity, Any] = {} + response: InferenceResponse = {} + + async def _run_inference(entry: InferenceEntry) -> None: + model = await model_cache.get(entry["name"], entry["type"], entry["task"], ttl=settings.model_ttl) + inputs = [payload] + for dep in model.depends: + try: + inputs.append(outputs[dep]) + except KeyError: + message = f"Task {entry['task']} of type {entry['type']} depends on output of {dep}" + raise HTTPException(400, message) + model = await load(model) + output = await run(model.predict, *inputs, **entry["options"]) + outputs[model.identity] = output + response[entry["task"]] = output + + without_deps, with_deps = entries + await asyncio.gather(*[_run_inference(entry) for entry in without_deps]) + if with_deps: + await asyncio.gather(*[_run_inference(entry) for entry in with_deps]) + if isinstance(payload, Image): + response["imageHeight"], response["imageWidth"] = payload.height, payload.width + + return response + + +async def run(func: Callable[..., T], *args: Any, **kwargs: Any) -> T: if thread_pool is None: - return func(inputs) - return await asyncio.get_running_loop().run_in_executor(thread_pool, func, inputs) + return func(*args, **kwargs) + partial_func = partial(func, *args, **kwargs) + return await asyncio.get_running_loop().run_in_executor(thread_pool, partial_func) async def load(model: InferenceModel) -> InferenceModel: if model.loaded: return model - def _load(model: InferenceModel) -> None: + def _load(model: InferenceModel) -> InferenceModel: with lock: 
model.load() + return model try: await run(_load, model) diff --git a/machine-learning/app/models/__init__.py b/machine-learning/app/models/__init__.py index 18d75e1632..25e726c64e 100644 --- a/machine-learning/app/models/__init__.py +++ b/machine-learning/app/models/__init__.py @@ -1,24 +1,40 @@ from typing import Any -from app.schemas import ModelType +from app.models.base import InferenceModel +from app.models.clip.textual import MClipTextualEncoder, OpenClipTextualEncoder +from app.models.clip.visual import OpenClipVisualEncoder +from app.schemas import ModelSource, ModelTask, ModelType -from .base import InferenceModel -from .clip import MCLIPEncoder, OpenCLIPEncoder -from .constants import is_insightface, is_mclip, is_openclip -from .facial_recognition import FaceRecognizer +from .constants import get_model_source +from .facial_recognition.detection import FaceDetector +from .facial_recognition.recognition import FaceRecognizer -def from_model_type(model_type: ModelType, model_name: str, **model_kwargs: Any) -> InferenceModel: - match model_type: - case ModelType.CLIP: - if is_openclip(model_name): - return OpenCLIPEncoder(model_name, **model_kwargs) - elif is_mclip(model_name): - return MCLIPEncoder(model_name, **model_kwargs) - case ModelType.FACIAL_RECOGNITION: - if is_insightface(model_name): - return FaceRecognizer(model_name, **model_kwargs) +def get_model_class(model_name: str, model_type: ModelType, model_task: ModelTask) -> type[InferenceModel]: + source = get_model_source(model_name) + match source, model_type, model_task: + case ModelSource.OPENCLIP | ModelSource.MCLIP, ModelType.VISUAL, ModelTask.SEARCH: + return OpenClipVisualEncoder + + case ModelSource.OPENCLIP, ModelType.TEXTUAL, ModelTask.SEARCH: + return OpenClipTextualEncoder + + case ModelSource.MCLIP, ModelType.TEXTUAL, ModelTask.SEARCH: + return MClipTextualEncoder + + case ModelSource.INSIGHTFACE, ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION: + return FaceDetector + + case ModelSource.INSIGHTFACE, ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION: + return FaceRecognizer + case _: - raise ValueError(f"Unknown model type {model_type}") + raise ValueError(f"Unknown model combination: {source}, {model_type}, {model_task}") - raise ValueError(f"Unknown {model_type} model {model_name}") + +def from_model_type(model_name: str, model_type: ModelType, model_task: ModelTask, **kwargs: Any) -> InferenceModel: + return get_model_class(model_name, model_type, model_task)(model_name, **kwargs) + + +def get_model_deps(model_name: str, model_type: ModelType, model_task: ModelTask) -> list[tuple[ModelType, ModelTask]]: + return get_model_class(model_name, model_type, model_task).depends diff --git a/machine-learning/app/models/base.py b/machine-learning/app/models/base.py index 16adc159ad..f64a873010 100644 --- a/machine-learning/app/models/base.py +++ b/machine-learning/app/models/base.py @@ -3,7 +3,7 @@ from __future__ import annotations from abc import ABC, abstractmethod from pathlib import Path from shutil import rmtree -from typing import Any +from typing import Any, ClassVar import onnxruntime as ort from huggingface_hub import snapshot_download @@ -11,13 +11,14 @@ from huggingface_hub import snapshot_download import ann.ann from app.models.constants import SUPPORTED_PROVIDERS -from ..config import get_cache_dir, get_hf_model_name, log, settings -from ..schemas import ModelRuntime, ModelType +from ..config import clean_name, log, settings +from ..schemas import ModelFormat, ModelIdentity, ModelSession, ModelTask, 
ModelType from .ann import AnnSession class InferenceModel(ABC): - _model_type: ModelType + depends: ClassVar[list[ModelIdentity]] + identity: ClassVar[ModelIdentity] def __init__( self, @@ -26,16 +27,16 @@ class InferenceModel(ABC): providers: list[str] | None = None, provider_options: list[dict[str, Any]] | None = None, sess_options: ort.SessionOptions | None = None, - preferred_runtime: ModelRuntime | None = None, + preferred_format: ModelFormat | None = None, **model_kwargs: Any, ) -> None: self.loaded = False - self.model_name = model_name + self.model_name = clean_name(model_name) self.cache_dir = Path(cache_dir) if cache_dir is not None else self.cache_dir_default self.providers = providers if providers is not None else self.providers_default self.provider_options = provider_options if provider_options is not None else self.provider_options_default self.sess_options = sess_options if sess_options is not None else self.sess_options_default - self.preferred_runtime = preferred_runtime if preferred_runtime is not None else self.preferred_runtime_default + self.preferred_format = preferred_format if preferred_format is not None else self.preferred_format_default def download(self) -> None: if not self.cached: @@ -47,35 +48,36 @@ class InferenceModel(ABC): def load(self) -> None: if self.loaded: return + self.download() log.info(f"Loading {self.model_type.replace('-', ' ')} model '{self.model_name}' to memory") - self._load() + self.session = self._load() self.loaded = True - def predict(self, inputs: Any, **model_kwargs: Any) -> Any: + def predict(self, *inputs: Any, **model_kwargs: Any) -> Any: self.load() if model_kwargs: self.configure(**model_kwargs) - return self._predict(inputs) + return self._predict(*inputs, **model_kwargs) @abstractmethod - def _predict(self, inputs: Any) -> Any: ... + def _predict(self, *inputs: Any, **model_kwargs: Any) -> Any: ... - def configure(self, **model_kwargs: Any) -> None: + def configure(self, **kwargs: Any) -> None: pass def _download(self) -> None: - ignore_patterns = [] if self.preferred_runtime == ModelRuntime.ARMNN else ["*.armnn"] + ignore_patterns = [] if self.preferred_format == ModelFormat.ARMNN else ["*.armnn"] snapshot_download( - get_hf_model_name(self.model_name), + f"immich-app/{clean_name(self.model_name)}", cache_dir=self.cache_dir, local_dir=self.cache_dir, local_dir_use_symlinks=False, ignore_patterns=ignore_patterns, ) - @abstractmethod - def _load(self) -> None: ... 
+ def _load(self) -> ModelSession: + return self._make_session(self.model_path) def clear_cache(self) -> None: if not self.cache_dir.exists(): @@ -99,7 +101,7 @@ class InferenceModel(ABC): self.cache_dir.unlink() self.cache_dir.mkdir(parents=True, exist_ok=True) - def _make_session(self, model_path: Path) -> AnnSession | ort.InferenceSession: + def _make_session(self, model_path: Path) -> ModelSession: if not model_path.is_file(): onnx_path = model_path.with_suffix(".onnx") if not onnx_path.is_file(): @@ -124,9 +126,21 @@ class InferenceModel(ABC): raise ValueError(f"Unsupported model file type: {model_path.suffix}") return session + @property + def model_dir(self) -> Path: + return self.cache_dir / self.model_type.value + + @property + def model_path(self) -> Path: + return self.model_dir / f"model.{self.preferred_format}" + + @property + def model_task(self) -> ModelTask: + return self.identity[1] + @property def model_type(self) -> ModelType: - return self._model_type + return self.identity[0] @property def cache_dir(self) -> Path: @@ -138,11 +152,11 @@ class InferenceModel(ABC): @property def cache_dir_default(self) -> Path: - return get_cache_dir(self.model_name, self.model_type) + return settings.cache_folder / self.model_task.value / self.model_name @property def cached(self) -> bool: - return self.cache_dir.is_dir() and any(self.cache_dir.iterdir()) + return self.model_path.is_file() @property def providers(self) -> list[str]: @@ -226,14 +240,14 @@ class InferenceModel(ABC): return sess_options @property - def preferred_runtime(self) -> ModelRuntime: - return self._preferred_runtime + def preferred_format(self) -> ModelFormat: + return self._preferred_format - @preferred_runtime.setter - def preferred_runtime(self, preferred_runtime: ModelRuntime) -> None: - log.debug(f"Setting preferred runtime to {preferred_runtime}") - self._preferred_runtime = preferred_runtime + @preferred_format.setter + def preferred_format(self, preferred_format: ModelFormat) -> None: + log.debug(f"Setting preferred format to {preferred_format}") + self._preferred_format = preferred_format @property - def preferred_runtime_default(self) -> ModelRuntime: - return ModelRuntime.ARMNN if ann.ann.is_available and settings.ann else ModelRuntime.ONNX + def preferred_format_default(self) -> ModelFormat: + return ModelFormat.ARMNN if ann.ann.is_available and settings.ann else ModelFormat.ONNX diff --git a/machine-learning/app/models/cache.py b/machine-learning/app/models/cache.py index 781a9caea0..bf8e8a6352 100644 --- a/machine-learning/app/models/cache.py +++ b/machine-learning/app/models/cache.py @@ -5,9 +5,9 @@ from aiocache.lock import OptimisticLock from aiocache.plugins import TimingPlugin from app.models import from_model_type +from app.models.base import InferenceModel -from ..schemas import ModelType, has_profiling -from .base import InferenceModel +from ..schemas import ModelTask, ModelType, has_profiling class ModelCache: @@ -31,28 +31,21 @@ class ModelCache: if profiling: plugins.append(TimingPlugin()) - self.revalidate_enable = revalidate + self.should_revalidate = revalidate self.cache = SimpleMemoryCache(timeout=timeout, plugins=plugins, namespace=None) - async def get(self, model_name: str, model_type: ModelType, **model_kwargs: Any) -> InferenceModel: - """ - Args: - model_name: Name of model in the model hub used for the task. - model_type: Model type or task, which determines which model zoo is used. - - Returns: - model: The requested model. 
- """ - - key = f"{model_name}{model_type.value}{model_kwargs.get('mode', '')}" + async def get( + self, model_name: str, model_type: ModelType, model_task: ModelTask, **model_kwargs: Any + ) -> InferenceModel: + key = f"{model_name}{model_type}{model_task}" async with OptimisticLock(self.cache, key) as lock: model: InferenceModel | None = await self.cache.get(key) if model is None: - model = from_model_type(model_type, model_name, **model_kwargs) + model = from_model_type(model_name, model_type, model_task, **model_kwargs) await lock.cas(model, ttl=model_kwargs.get("ttl", None)) - elif self.revalidate_enable: + elif self.should_revalidate: await self.revalidate(key, model_kwargs.get("ttl", None)) return model diff --git a/machine-learning/app/models/clip.py b/machine-learning/app/models/clip.py deleted file mode 100644 index b0d73175a9..0000000000 --- a/machine-learning/app/models/clip.py +++ /dev/null @@ -1,189 +0,0 @@ -import json -from abc import abstractmethod -from functools import cached_property -from io import BytesIO -from pathlib import Path -from typing import Any, Literal - -import numpy as np -from numpy.typing import NDArray -from PIL import Image -from tokenizers import Encoding, Tokenizer - -from app.config import clean_name, log -from app.models.transforms import crop, get_pil_resampling, normalize, resize, to_numpy -from app.schemas import ModelType - -from .base import InferenceModel - - -class BaseCLIPEncoder(InferenceModel): - _model_type = ModelType.CLIP - - def __init__( - self, - model_name: str, - cache_dir: Path | str | None = None, - mode: Literal["text", "vision"] | None = None, - **model_kwargs: Any, - ) -> None: - self.mode = mode - super().__init__(model_name, cache_dir, **model_kwargs) - - def _load(self) -> None: - if self.mode == "text" or self.mode is None: - log.debug(f"Loading clip text model '{self.model_name}'") - self.text_model = self._make_session(self.textual_path) - log.debug(f"Loaded clip text model '{self.model_name}'") - - if self.mode == "vision" or self.mode is None: - log.debug(f"Loading clip vision model '{self.model_name}'") - self.vision_model = self._make_session(self.visual_path) - log.debug(f"Loaded clip vision model '{self.model_name}'") - - def _predict(self, image_or_text: Image.Image | str) -> NDArray[np.float32]: - if isinstance(image_or_text, bytes): - image_or_text = Image.open(BytesIO(image_or_text)) - - match image_or_text: - case Image.Image(): - if self.mode == "text": - raise TypeError("Cannot encode image as text-only model") - outputs: NDArray[np.float32] = self.vision_model.run(None, self.transform(image_or_text))[0][0] - case str(): - if self.mode == "vision": - raise TypeError("Cannot encode text as vision-only model") - outputs = self.text_model.run(None, self.tokenize(image_or_text))[0][0] - case _: - raise TypeError(f"Expected Image or str, but got: {type(image_or_text)}") - - return outputs - - @abstractmethod - def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]: - pass - - @abstractmethod - def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]: - pass - - @property - def textual_dir(self) -> Path: - return self.cache_dir / "textual" - - @property - def visual_dir(self) -> Path: - return self.cache_dir / "visual" - - @property - def model_cfg_path(self) -> Path: - return self.cache_dir / "config.json" - - @property - def textual_path(self) -> Path: - return self.textual_dir / f"model.{self.preferred_runtime}" - - @property - def visual_path(self) -> Path: - return self.visual_dir / 
f"model.{self.preferred_runtime}" - - @property - def tokenizer_file_path(self) -> Path: - return self.textual_dir / "tokenizer.json" - - @property - def tokenizer_cfg_path(self) -> Path: - return self.textual_dir / "tokenizer_config.json" - - @property - def preprocess_cfg_path(self) -> Path: - return self.visual_dir / "preprocess_cfg.json" - - @property - def cached(self) -> bool: - return self.textual_path.is_file() and self.visual_path.is_file() - - @cached_property - def model_cfg(self) -> dict[str, Any]: - log.debug(f"Loading model config for CLIP model '{self.model_name}'") - model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open()) - log.debug(f"Loaded model config for CLIP model '{self.model_name}'") - return model_cfg - - @cached_property - def tokenizer_file(self) -> dict[str, Any]: - log.debug(f"Loading tokenizer file for CLIP model '{self.model_name}'") - tokenizer_file: dict[str, Any] = json.load(self.tokenizer_file_path.open()) - log.debug(f"Loaded tokenizer file for CLIP model '{self.model_name}'") - return tokenizer_file - - @cached_property - def tokenizer_cfg(self) -> dict[str, Any]: - log.debug(f"Loading tokenizer config for CLIP model '{self.model_name}'") - tokenizer_cfg: dict[str, Any] = json.load(self.tokenizer_cfg_path.open()) - log.debug(f"Loaded tokenizer config for CLIP model '{self.model_name}'") - return tokenizer_cfg - - @cached_property - def preprocess_cfg(self) -> dict[str, Any]: - log.debug(f"Loading visual preprocessing config for CLIP model '{self.model_name}'") - preprocess_cfg: dict[str, Any] = json.load(self.preprocess_cfg_path.open()) - log.debug(f"Loaded visual preprocessing config for CLIP model '{self.model_name}'") - return preprocess_cfg - - -class OpenCLIPEncoder(BaseCLIPEncoder): - def __init__( - self, - model_name: str, - cache_dir: Path | str | None = None, - mode: Literal["text", "vision"] | None = None, - **model_kwargs: Any, - ) -> None: - super().__init__(clean_name(model_name), cache_dir, mode, **model_kwargs) - - def _load(self) -> None: - super()._load() - self._load_tokenizer() - - size: list[int] | int = self.preprocess_cfg["size"] - self.size = size[0] if isinstance(size, list) else size - - self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"]) - self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32) - self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32) - - def _load_tokenizer(self) -> Tokenizer: - log.debug(f"Loading tokenizer for CLIP model '{self.model_name}'") - - text_cfg: dict[str, Any] = self.model_cfg["text_cfg"] - context_length: int = text_cfg.get("context_length", 77) - pad_token: str = self.tokenizer_cfg["pad_token"] - - self.tokenizer: Tokenizer = Tokenizer.from_file(self.tokenizer_file_path.as_posix()) - - pad_id: int = self.tokenizer.token_to_id(pad_token) - self.tokenizer.enable_padding(length=context_length, pad_token=pad_token, pad_id=pad_id) - self.tokenizer.enable_truncation(max_length=context_length) - - log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'") - - def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]: - tokens: Encoding = self.tokenizer.encode(text) - return {"text": np.array([tokens.ids], dtype=np.int32)} - - def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]: - image = resize(image, self.size) - image = crop(image, self.size) - image_np = to_numpy(image) - image_np = normalize(image_np, self.mean, self.std) - return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)} - - -class 
MCLIPEncoder(OpenCLIPEncoder): - def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]: - tokens: Encoding = self.tokenizer.encode(text) - return { - "input_ids": np.array([tokens.ids], dtype=np.int32), - "attention_mask": np.array([tokens.attention_mask], dtype=np.int32), - } diff --git a/machine-learning/app/models/clip/textual.py b/machine-learning/app/models/clip/textual.py new file mode 100644 index 0000000000..4d86bf05b0 --- /dev/null +++ b/machine-learning/app/models/clip/textual.py @@ -0,0 +1,98 @@ +import json +from abc import abstractmethod +from functools import cached_property +from pathlib import Path +from typing import Any + +import numpy as np +from numpy.typing import NDArray +from tokenizers import Encoding, Tokenizer + +from app.config import log +from app.models.base import InferenceModel +from app.schemas import ModelSession, ModelTask, ModelType + + +class BaseCLIPTextualEncoder(InferenceModel): + depends = [] + identity = (ModelType.TEXTUAL, ModelTask.SEARCH) + + def _predict(self, inputs: str, **kwargs: Any) -> NDArray[np.float32]: + res: NDArray[np.float32] = self.session.run(None, self.tokenize(inputs))[0][0] + return res + + def _load(self) -> ModelSession: + log.debug(f"Loading tokenizer for CLIP model '{self.model_name}'") + self.tokenizer = self._load_tokenizer() + log.debug(f"Loaded tokenizer for CLIP model '{self.model_name}'") + + return super()._load() + + @abstractmethod + def _load_tokenizer(self) -> Tokenizer: + pass + + @abstractmethod + def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]: + pass + + @property + def model_cfg_path(self) -> Path: + return self.cache_dir / "config.json" + + @property + def tokenizer_file_path(self) -> Path: + return self.model_dir / "tokenizer.json" + + @property + def tokenizer_cfg_path(self) -> Path: + return self.model_dir / "tokenizer_config.json" + + @cached_property + def model_cfg(self) -> dict[str, Any]: + log.debug(f"Loading model config for CLIP model '{self.model_name}'") + model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open()) + log.debug(f"Loaded model config for CLIP model '{self.model_name}'") + return model_cfg + + @cached_property + def tokenizer_file(self) -> dict[str, Any]: + log.debug(f"Loading tokenizer file for CLIP model '{self.model_name}'") + tokenizer_file: dict[str, Any] = json.load(self.tokenizer_file_path.open()) + log.debug(f"Loaded tokenizer file for CLIP model '{self.model_name}'") + return tokenizer_file + + @cached_property + def tokenizer_cfg(self) -> dict[str, Any]: + log.debug(f"Loading tokenizer config for CLIP model '{self.model_name}'") + tokenizer_cfg: dict[str, Any] = json.load(self.tokenizer_cfg_path.open()) + log.debug(f"Loaded tokenizer config for CLIP model '{self.model_name}'") + return tokenizer_cfg + + +class OpenClipTextualEncoder(BaseCLIPTextualEncoder): + def _load_tokenizer(self) -> Tokenizer: + text_cfg: dict[str, Any] = self.model_cfg["text_cfg"] + context_length: int = text_cfg.get("context_length", 77) + pad_token: str = self.tokenizer_cfg["pad_token"] + + tokenizer: Tokenizer = Tokenizer.from_file(self.tokenizer_file_path.as_posix()) + + pad_id: int = tokenizer.token_to_id(pad_token) + tokenizer.enable_padding(length=context_length, pad_token=pad_token, pad_id=pad_id) + tokenizer.enable_truncation(max_length=context_length) + + return tokenizer + + def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]: + tokens: Encoding = self.tokenizer.encode(text) + return {"text": np.array([tokens.ids], dtype=np.int32)} + + +class 
MClipTextualEncoder(OpenClipTextualEncoder): + def tokenize(self, text: str) -> dict[str, NDArray[np.int32]]: + tokens: Encoding = self.tokenizer.encode(text) + return { + "input_ids": np.array([tokens.ids], dtype=np.int32), + "attention_mask": np.array([tokens.attention_mask], dtype=np.int32), + } diff --git a/machine-learning/app/models/clip/visual.py b/machine-learning/app/models/clip/visual.py new file mode 100644 index 0000000000..48058c961a --- /dev/null +++ b/machine-learning/app/models/clip/visual.py @@ -0,0 +1,69 @@ +import json +from abc import abstractmethod +from functools import cached_property +from pathlib import Path +from typing import Any + +import numpy as np +from numpy.typing import NDArray +from PIL import Image + +from app.config import log +from app.models.base import InferenceModel +from app.models.transforms import crop_pil, decode_pil, get_pil_resampling, normalize, resize_pil, to_numpy +from app.schemas import ModelSession, ModelTask, ModelType + + +class BaseCLIPVisualEncoder(InferenceModel): + depends = [] + identity = (ModelType.VISUAL, ModelTask.SEARCH) + + def _predict(self, inputs: Image.Image | bytes, **kwargs: Any) -> NDArray[np.float32]: + image = decode_pil(inputs) + res: NDArray[np.float32] = self.session.run(None, self.transform(image))[0][0] + return res + + @abstractmethod + def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]: + pass + + @property + def model_cfg_path(self) -> Path: + return self.cache_dir / "config.json" + + @property + def preprocess_cfg_path(self) -> Path: + return self.model_dir / "preprocess_cfg.json" + + @cached_property + def model_cfg(self) -> dict[str, Any]: + log.debug(f"Loading model config for CLIP model '{self.model_name}'") + model_cfg: dict[str, Any] = json.load(self.model_cfg_path.open()) + log.debug(f"Loaded model config for CLIP model '{self.model_name}'") + return model_cfg + + @cached_property + def preprocess_cfg(self) -> dict[str, Any]: + log.debug(f"Loading visual preprocessing config for CLIP model '{self.model_name}'") + preprocess_cfg: dict[str, Any] = json.load(self.preprocess_cfg_path.open()) + log.debug(f"Loaded visual preprocessing config for CLIP model '{self.model_name}'") + return preprocess_cfg + + +class OpenClipVisualEncoder(BaseCLIPVisualEncoder): + def _load(self) -> ModelSession: + size: list[int] | int = self.preprocess_cfg["size"] + self.size = size[0] if isinstance(size, list) else size + + self.resampling = get_pil_resampling(self.preprocess_cfg["interpolation"]) + self.mean = np.array(self.preprocess_cfg["mean"], dtype=np.float32) + self.std = np.array(self.preprocess_cfg["std"], dtype=np.float32) + + return super()._load() + + def transform(self, image: Image.Image) -> dict[str, NDArray[np.float32]]: + image = resize_pil(image, self.size) + image = crop_pil(image, self.size) + image_np = to_numpy(image) + image_np = normalize(image_np, self.mean, self.std) + return {"image": np.expand_dims(image_np.transpose(2, 0, 1), 0)} diff --git a/machine-learning/app/models/constants.py b/machine-learning/app/models/constants.py index b112e9279d..c51dd3b66d 100644 --- a/machine-learning/app/models/constants.py +++ b/machine-learning/app/models/constants.py @@ -1,4 +1,5 @@ from app.config import clean_name +from app.schemas import ModelSource _OPENCLIP_MODELS = { "RN50__openai", @@ -54,13 +55,16 @@ _INSIGHTFACE_MODELS = { SUPPORTED_PROVIDERS = ["CUDAExecutionProvider", "OpenVINOExecutionProvider", "CPUExecutionProvider"] -def is_openclip(model_name: str) -> bool: - return 
clean_name(model_name) in _OPENCLIP_MODELS +def get_model_source(model_name: str) -> ModelSource | None: + cleaned_name = clean_name(model_name) + if cleaned_name in _INSIGHTFACE_MODELS: + return ModelSource.INSIGHTFACE -def is_mclip(model_name: str) -> bool: - return clean_name(model_name) in _MCLIP_MODELS + if cleaned_name in _MCLIP_MODELS: + return ModelSource.MCLIP + if cleaned_name in _OPENCLIP_MODELS: + return ModelSource.OPENCLIP -def is_insightface(model_name: str) -> bool: - return clean_name(model_name) in _INSIGHTFACE_MODELS + return None diff --git a/machine-learning/app/models/facial_recognition.py b/machine-learning/app/models/facial_recognition.py deleted file mode 100644 index 894f5ec726..0000000000 --- a/machine-learning/app/models/facial_recognition.py +++ /dev/null @@ -1,90 +0,0 @@ -from pathlib import Path -from typing import Any - -import cv2 -import numpy as np -from insightface.model_zoo import ArcFaceONNX, RetinaFace -from insightface.utils.face_align import norm_crop -from numpy.typing import NDArray - -from app.config import clean_name -from app.schemas import Face, ModelType, is_ndarray - -from .base import InferenceModel - - -class FaceRecognizer(InferenceModel): - _model_type = ModelType.FACIAL_RECOGNITION - - def __init__( - self, - model_name: str, - min_score: float = 0.7, - cache_dir: Path | str | None = None, - **model_kwargs: Any, - ) -> None: - self.min_score = model_kwargs.pop("minScore", min_score) - super().__init__(clean_name(model_name), cache_dir, **model_kwargs) - - def _load(self) -> None: - self.det_model = RetinaFace(session=self._make_session(self.det_file)) - self.rec_model = ArcFaceONNX( - self.rec_file.with_suffix(".onnx").as_posix(), - session=self._make_session(self.rec_file), - ) - - self.det_model.prepare( - ctx_id=0, - det_thresh=self.min_score, - input_size=(640, 640), - ) - self.rec_model.prepare(ctx_id=0) - - def _predict(self, image: NDArray[np.uint8] | bytes) -> list[Face]: - if isinstance(image, bytes): - decoded_image = cv2.imdecode(np.frombuffer(image, np.uint8), cv2.IMREAD_COLOR) - else: - decoded_image = image - assert is_ndarray(decoded_image, np.uint8) - bboxes, kpss = self.det_model.detect(decoded_image) - if bboxes.size == 0: - return [] - assert is_ndarray(kpss, np.float32) - - scores = bboxes[:, 4].tolist() - bboxes = bboxes[:, :4].round().tolist() - - results = [] - height, width, _ = decoded_image.shape - for (x1, y1, x2, y2), score, kps in zip(bboxes, scores, kpss): - cropped_img = norm_crop(decoded_image, kps) - embedding: NDArray[np.float32] = self.rec_model.get_feat(cropped_img)[0] - face: Face = { - "imageWidth": width, - "imageHeight": height, - "boundingBox": { - "x1": x1, - "y1": y1, - "x2": x2, - "y2": y2, - }, - "score": score, - "embedding": embedding, - } - results.append(face) - return results - - @property - def cached(self) -> bool: - return self.det_file.is_file() and self.rec_file.is_file() - - @property - def det_file(self) -> Path: - return self.cache_dir / "detection" / f"model.{self.preferred_runtime}" - - @property - def rec_file(self) -> Path: - return self.cache_dir / "recognition" / f"model.{self.preferred_runtime}" - - def configure(self, **model_kwargs: Any) -> None: - self.det_model.det_thresh = model_kwargs.pop("minScore", self.det_model.det_thresh) diff --git a/machine-learning/app/models/facial_recognition/detection.py b/machine-learning/app/models/facial_recognition/detection.py new file mode 100644 index 0000000000..2efacd447e --- /dev/null +++ 
b/machine-learning/app/models/facial_recognition/detection.py @@ -0,0 +1,48 @@ +from pathlib import Path +from typing import Any + +import numpy as np +from insightface.model_zoo import RetinaFace +from numpy.typing import NDArray + +from app.models.base import InferenceModel +from app.models.transforms import decode_cv2 +from app.schemas import FaceDetectionOutput, ModelSession, ModelTask, ModelType + + +class FaceDetector(InferenceModel): + depends = [] + identity = (ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION) + + def __init__( + self, + model_name: str, + min_score: float = 0.7, + cache_dir: Path | str | None = None, + **model_kwargs: Any, + ) -> None: + self.min_score = model_kwargs.pop("minScore", min_score) + super().__init__(model_name, cache_dir, **model_kwargs) + + def _load(self) -> ModelSession: + session = self._make_session(self.model_path) + self.model = RetinaFace(session=session) + self.model.prepare(ctx_id=0, det_thresh=self.min_score, input_size=(640, 640)) + + return session + + def _predict(self, inputs: NDArray[np.uint8] | bytes, **kwargs: Any) -> FaceDetectionOutput: + inputs = decode_cv2(inputs) + + bboxes, landmarks = self._detect(inputs) + return { + "boxes": bboxes[:, :4].round(), + "scores": bboxes[:, 4], + "landmarks": landmarks, + } + + def _detect(self, inputs: NDArray[np.uint8] | bytes) -> tuple[NDArray[np.float32], NDArray[np.float32]]: + return self.model.detect(inputs) # type: ignore + + def configure(self, **kwargs: Any) -> None: + self.model.det_thresh = kwargs.pop("minScore", self.model.det_thresh) diff --git a/machine-learning/app/models/facial_recognition/recognition.py b/machine-learning/app/models/facial_recognition/recognition.py new file mode 100644 index 0000000000..cb8093dd95 --- /dev/null +++ b/machine-learning/app/models/facial_recognition/recognition.py @@ -0,0 +1,77 @@ +from pathlib import Path +from typing import Any + +import numpy as np +import onnx +import onnxruntime as ort +from insightface.model_zoo import ArcFaceONNX +from insightface.utils.face_align import norm_crop +from numpy.typing import NDArray +from onnx.tools.update_model_dims import update_inputs_outputs_dims +from PIL import Image + +from app.config import clean_name, log +from app.models.base import InferenceModel +from app.models.transforms import decode_cv2 +from app.schemas import FaceDetectionOutput, FacialRecognitionOutput, ModelSession, ModelTask, ModelType + + +class FaceRecognizer(InferenceModel): + depends = [(ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION)] + identity = (ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION) + + def __init__( + self, + model_name: str, + min_score: float = 0.7, + cache_dir: Path | str | None = None, + **model_kwargs: Any, + ) -> None: + self.min_score = model_kwargs.pop("minScore", min_score) + super().__init__(clean_name(model_name), cache_dir, **model_kwargs) + + def _load(self) -> ModelSession: + session = self._make_session(self.model_path) + if not self._has_batch_dim(session): + self._add_batch_dim(self.model_path) + session = self._make_session(self.model_path) + self.model = ArcFaceONNX( + self.model_path.with_suffix(".onnx").as_posix(), + session=session, + ) + return session + + def _predict( + self, inputs: NDArray[np.uint8] | bytes | Image.Image, faces: FaceDetectionOutput, **kwargs: Any + ) -> FacialRecognitionOutput: + if faces["boxes"].shape[0] == 0: + return [] + inputs = decode_cv2(inputs) + embeddings: NDArray[np.float32] = self.model.get_feat(self._crop(inputs, faces)) + return self.postprocess(faces, 
embeddings) + + def postprocess(self, faces: FaceDetectionOutput, embeddings: NDArray[np.float32]) -> FacialRecognitionOutput: + return [ + { + "boundingBox": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}, + "embedding": embedding, + "score": score, + } + for (x1, y1, x2, y2), embedding, score in zip(faces["boxes"], embeddings, faces["scores"]) + ] + + def _crop(self, image: NDArray[np.uint8], faces: FaceDetectionOutput) -> list[NDArray[np.uint8]]: + return [norm_crop(image, landmark) for landmark in faces["landmarks"]] + + def _has_batch_dim(self, session: ort.InferenceSession) -> bool: + return not isinstance(session, ort.InferenceSession) or session.get_inputs()[0].shape[0] == "batch" + + def _add_batch_dim(self, model_path: Path) -> None: + log.debug(f"Adding batch dimension to model {model_path}") + proto = onnx.load(model_path) + static_input_dims = [shape.dim_value for shape in proto.graph.input[0].type.tensor_type.shape.dim[1:]] + static_output_dims = [shape.dim_value for shape in proto.graph.output[0].type.tensor_type.shape.dim[1:]] + input_dims = {proto.graph.input[0].name: ["batch"] + static_input_dims} + output_dims = {proto.graph.output[0].name: ["batch"] + static_output_dims} + updated_proto = update_inputs_outputs_dims(proto, input_dims, output_dims) + onnx.save(updated_proto, model_path) diff --git a/machine-learning/app/models/session.py b/machine-learning/app/models/session.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/machine-learning/app/models/transforms.py b/machine-learning/app/models/transforms.py index 7ed001b8c2..ababdac99f 100644 --- a/machine-learning/app/models/transforms.py +++ b/machine-learning/app/models/transforms.py @@ -1,3 +1,7 @@ +from io import BytesIO +from typing import IO + +import cv2 import numpy as np from numpy.typing import NDArray from PIL import Image @@ -5,7 +9,7 @@ from PIL import Image _PIL_RESAMPLING_METHODS = {resampling.name.lower(): resampling for resampling in Image.Resampling} -def resize(img: Image.Image, size: int) -> Image.Image: +def resize_pil(img: Image.Image, size: int) -> Image.Image: if img.width < img.height: return img.resize((size, int((img.height / img.width) * size)), resample=Image.Resampling.BICUBIC) else: @@ -13,7 +17,7 @@ def resize(img: Image.Image, size: int) -> Image.Image: # https://stackoverflow.com/a/60883103 -def crop(img: Image.Image, size: int) -> Image.Image: +def crop_pil(img: Image.Image, size: int) -> Image.Image: left = int((img.size[0] / 2) - (size / 2)) upper = int((img.size[1] / 2) - (size / 2)) right = left + size @@ -23,14 +27,36 @@ def crop(img: Image.Image, size: int) -> Image.Image: def to_numpy(img: Image.Image) -> NDArray[np.float32]: - return np.asarray(img.convert("RGB")).astype(np.float32) / 255.0 + return np.asarray(img if img.mode == "RGB" else img.convert("RGB"), dtype=np.float32) / 255.0 def normalize( img: NDArray[np.float32], mean: float | NDArray[np.float32], std: float | NDArray[np.float32] ) -> NDArray[np.float32]: - return (img - mean) / std + return np.divide(img - mean, std, dtype=np.float32) def get_pil_resampling(resample: str) -> Image.Resampling: return _PIL_RESAMPLING_METHODS[resample.lower()] + + +def pil_to_cv2(image: Image.Image) -> NDArray[np.uint8]: + return cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR) # type: ignore + + +def decode_pil(image_bytes: bytes | IO[bytes] | Image.Image) -> Image.Image: + if isinstance(image_bytes, Image.Image): + return image_bytes + image = Image.open(BytesIO(image_bytes) if isinstance(image_bytes, bytes) else 
image_bytes) + image.load() # type: ignore + if not image.mode == "RGB": + image = image.convert("RGB") + return image + + +def decode_cv2(image_bytes: NDArray[np.uint8] | bytes | Image.Image) -> NDArray[np.uint8]: + if isinstance(image_bytes, bytes): + image_bytes = decode_pil(image_bytes) # pillow is much faster than cv2 + if isinstance(image_bytes, Image.Image): + return pil_to_cv2(image_bytes) + return image_bytes diff --git a/machine-learning/app/schemas.py b/machine-learning/app/schemas.py index e2a027e1b8..739098a6bf 100644 --- a/machine-learning/app/schemas.py +++ b/machine-learning/app/schemas.py @@ -1,5 +1,5 @@ from enum import Enum -from typing import Any, Protocol, TypedDict, TypeGuard +from typing import Any, Literal, Protocol, TypedDict, TypeGuard, TypeVar import numpy as np import numpy.typing as npt @@ -28,31 +28,87 @@ class BoundingBox(TypedDict): y2: int -class ModelType(StrEnum): - CLIP = "clip" +class ModelTask(StrEnum): FACIAL_RECOGNITION = "facial-recognition" + SEARCH = "clip" -class ModelRuntime(StrEnum): - ONNX = "onnx" +class ModelType(StrEnum): + DETECTION = "detection" + RECOGNITION = "recognition" + TEXTUAL = "textual" + VISUAL = "visual" + + +class ModelFormat(StrEnum): ARMNN = "armnn" + ONNX = "onnx" + + +class ModelSource(StrEnum): + INSIGHTFACE = "insightface" + MCLIP = "mclip" + OPENCLIP = "openclip" + + +ModelIdentity = tuple[ModelType, ModelTask] + + +class ModelSession(Protocol): + def run( + self, + output_names: list[str] | None, + input_feed: dict[str, npt.NDArray[np.float32]] | dict[str, npt.NDArray[np.int32]], + run_options: Any = None, + ) -> list[npt.NDArray[np.float32]]: ... class HasProfiling(Protocol): profiling: dict[str, float] -class Face(TypedDict): +class FaceDetectionOutput(TypedDict): + boxes: npt.NDArray[np.float32] + scores: npt.NDArray[np.float32] + landmarks: npt.NDArray[np.float32] + + +class DetectedFace(TypedDict): boundingBox: BoundingBox embedding: npt.NDArray[np.float32] - imageWidth: int - imageHeight: int score: float +FacialRecognitionOutput = list[DetectedFace] + + +class PipelineEntry(TypedDict): + modelName: str + options: dict[str, Any] + + +PipelineRequest = dict[ModelTask, dict[ModelType, PipelineEntry]] + + +class InferenceEntry(TypedDict): + name: str + task: ModelTask + type: ModelType + options: dict[str, Any] + + +InferenceEntries = tuple[list[InferenceEntry], list[InferenceEntry]] + + +InferenceResponse = dict[ModelTask | Literal["imageHeight"] | Literal["imageWidth"], Any] + + def has_profiling(obj: Any) -> TypeGuard[HasProfiling]: return hasattr(obj, "profiling") and isinstance(obj.profiling, dict) def is_ndarray(obj: Any, dtype: "type[np._DTypeScalar_co]") -> "TypeGuard[npt.NDArray[np._DTypeScalar_co]]": return isinstance(obj, np.ndarray) and obj.dtype == dtype + + +T = TypeVar("T") diff --git a/machine-learning/app/test_main.py b/machine-learning/app/test_main.py index 22038a6544..d9d1455bd1 100644 --- a/machine-learning/app/test_main.py +++ b/machine-learning/app/test_main.py @@ -17,13 +17,15 @@ from pytest import MonkeyPatch from pytest_mock import MockerFixture from app.main import load, preload_models +from app.models.clip.textual import MClipTextualEncoder, OpenClipTextualEncoder +from app.models.clip.visual import OpenClipVisualEncoder +from app.models.facial_recognition.detection import FaceDetector +from app.models.facial_recognition.recognition import FaceRecognizer from .config import Settings, log, settings from .models.base import InferenceModel from .models.cache import ModelCache -from 
.models.clip import MCLIPEncoder, OpenCLIPEncoder -from .models.facial_recognition import FaceRecognizer -from .schemas import ModelRuntime, ModelType +from .schemas import ModelFormat, ModelTask, ModelType class TestBase: @@ -35,13 +37,13 @@ class TestBase: @pytest.mark.providers(CPU_EP) def test_sets_cpu_provider(self, providers: list[str]) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.providers == self.CPU_EP @pytest.mark.providers(CUDA_EP) def test_sets_cuda_provider_if_available(self, providers: list[str]) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.providers == self.CUDA_EP @@ -50,7 +52,7 @@ class TestBase: mocked = mocker.patch("app.models.base.ort.capi._pybind_state") mocked.get_available_openvino_device_ids.return_value = ["GPU.0", "CPU"] - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.providers == self.OV_EP @@ -59,25 +61,25 @@ class TestBase: mocked = mocker.patch("app.models.base.ort.capi._pybind_state") mocked.get_available_openvino_device_ids.return_value = ["CPU"] - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.providers == self.CPU_EP @pytest.mark.providers(CUDA_EP_OUT_OF_ORDER) def test_sets_providers_in_correct_order(self, providers: list[str]) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.providers == self.CUDA_EP @pytest.mark.providers(TRT_EP) def test_ignores_unsupported_providers(self, providers: list[str]) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.providers == self.CUDA_EP def test_sets_provider_kwarg(self) -> None: providers = ["CUDAExecutionProvider"] - encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=providers) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", providers=providers) assert encoder.providers == providers @@ -85,7 +87,9 @@ class TestBase: mocked = mocker.patch("app.models.base.ort.capi._pybind_state") mocked.get_available_openvino_device_ids.return_value = ["GPU.0", "CPU"] - encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"]) + encoder = OpenClipTextualEncoder( + "ViT-B-32__openai", providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"] + ) assert encoder.provider_options == [ {"device_type": "GPU_FP32", "cache_dir": (encoder.cache_dir / "openvino").as_posix()}, @@ -93,7 +97,7 @@ class TestBase: ] def test_sets_provider_options_kwarg(self) -> None: - encoder = OpenCLIPEncoder( + encoder = OpenClipTextualEncoder( "ViT-B-32__openai", providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"], provider_options=[], @@ -102,7 +106,7 @@ class TestBase: assert encoder.provider_options == [] def test_sets_default_sess_options(self) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.sess_options.execution_mode == ort.ExecutionMode.ORT_SEQUENTIAL assert encoder.sess_options.inter_op_num_threads == 1 @@ -110,7 +114,9 @@ class TestBase: assert encoder.sess_options.enable_cpu_mem_arena is False def test_sets_default_sess_options_does_not_set_threads_if_non_cpu_and_default_threads(self) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai", 
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) + encoder = OpenClipTextualEncoder( + "ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"] + ) assert encoder.sess_options.inter_op_num_threads == 0 assert encoder.sess_options.intra_op_num_threads == 0 @@ -120,14 +126,16 @@ class TestBase: mock_settings.model_inter_op_threads = 2 mock_settings.model_intra_op_threads = 4 - encoder = OpenCLIPEncoder("ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) + encoder = OpenClipTextualEncoder( + "ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"] + ) assert encoder.sess_options.inter_op_num_threads == 2 assert encoder.sess_options.intra_op_num_threads == 4 def test_sets_sess_options_kwarg(self) -> None: sess_options = ort.SessionOptions() - encoder = OpenCLIPEncoder( + encoder = OpenClipTextualEncoder( "ViT-B-32__openai", providers=["OpenVINOExecutionProvider", "CPUExecutionProvider"], provider_options=[], @@ -137,43 +145,43 @@ class TestBase: assert sess_options is encoder.sess_options def test_sets_default_cache_dir(self) -> None: - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") assert encoder.cache_dir == Path(settings.cache_folder) / "clip" / "ViT-B-32__openai" def test_sets_cache_dir_kwarg(self) -> None: cache_dir = Path("/test_cache") - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=cache_dir) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=cache_dir) assert encoder.cache_dir == cache_dir - def test_sets_default_preferred_runtime(self, mocker: MockerFixture) -> None: + def test_sets_default_preferred_format(self, mocker: MockerFixture) -> None: mocker.patch.object(settings, "ann", True) mocker.patch("ann.ann.is_available", False) - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") - assert encoder.preferred_runtime == ModelRuntime.ONNX + assert encoder.preferred_format == ModelFormat.ONNX - def test_sets_default_preferred_runtime_to_armnn_if_available(self, mocker: MockerFixture) -> None: + def test_sets_default_preferred_format_to_armnn_if_available(self, mocker: MockerFixture) -> None: mocker.patch.object(settings, "ann", True) mocker.patch("ann.ann.is_available", True) - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") - assert encoder.preferred_runtime == ModelRuntime.ARMNN + assert encoder.preferred_format == ModelFormat.ARMNN - def test_sets_preferred_runtime_kwarg(self, mocker: MockerFixture) -> None: + def test_sets_preferred_format_kwarg(self, mocker: MockerFixture) -> None: mocker.patch.object(settings, "ann", False) mocker.patch("ann.ann.is_available", False) - encoder = OpenCLIPEncoder("ViT-B-32__openai", preferred_runtime=ModelRuntime.ARMNN) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", preferred_format=ModelFormat.ARMNN) - assert encoder.preferred_runtime == ModelRuntime.ARMNN + assert encoder.preferred_format == ModelFormat.ARMNN def test_casts_cache_dir_string_to_path(self) -> None: cache_dir = "/test_cache" - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=cache_dir) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=cache_dir) assert encoder.cache_dir == Path(cache_dir) @@ -186,7 +194,7 @@ class TestBase: mocker.patch("app.models.base.Path", return_value=mock_cache_dir) info = mocker.spy(log, "info") - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) 
+ encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) encoder.clear_cache() mock_rmtree.assert_called_once_with(encoder.cache_dir) @@ -201,7 +209,7 @@ class TestBase: mocker.patch("app.models.base.Path", return_value=mock_cache_dir) warning = mocker.spy(log, "warning") - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) encoder.clear_cache() mock_rmtree.assert_not_called() @@ -215,7 +223,7 @@ class TestBase: mock_cache_dir.is_dir.return_value = True mocker.patch("app.models.base.Path", return_value=mock_cache_dir) - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) with pytest.raises(RuntimeError): encoder.clear_cache() @@ -230,7 +238,7 @@ class TestBase: mocker.patch("app.models.base.Path", return_value=mock_cache_dir) warning = mocker.spy(log, "warning") - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir=mock_cache_dir) encoder.clear_cache() mock_rmtree.assert_not_called() @@ -245,7 +253,7 @@ class TestBase: mock_model_path.with_suffix.return_value = mock_model_path mock_ann = mocker.patch("app.models.base.AnnSession") - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") encoder._make_session(mock_model_path) mock_ann.assert_called_once() @@ -263,7 +271,7 @@ class TestBase: mock_ann = mocker.patch("app.models.base.AnnSession") mock_ort = mocker.patch("app.models.base.ort.InferenceSession") - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") encoder._make_session(mock_armnn_path) mock_ort.assert_called_once() @@ -277,7 +285,7 @@ class TestBase: mock_ann = mocker.patch("app.models.base.AnnSession") mock_ort = mocker.patch("app.models.base.ort.InferenceSession") - encoder = OpenCLIPEncoder("ViT-B-32__openai") + encoder = OpenClipTextualEncoder("ViT-B-32__openai") with pytest.raises(ValueError): encoder._make_session(mock_model_path) @@ -287,7 +295,7 @@ class TestBase: def test_download(self, mocker: MockerFixture) -> None: mock_snapshot_download = mocker.patch("app.models.base.snapshot_download") - encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="/path/to/cache") + encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="/path/to/cache") encoder.download() mock_snapshot_download.assert_called_once_with( @@ -298,10 +306,10 @@ class TestBase: ignore_patterns=["*.armnn"], ) - def test_download_downloads_armnn_if_preferred_runtime(self, mocker: MockerFixture) -> None: + def test_download_downloads_armnn_if_preferred_format(self, mocker: MockerFixture) -> None: mock_snapshot_download = mocker.patch("app.models.base.snapshot_download") - encoder = OpenCLIPEncoder("ViT-B-32__openai", preferred_runtime=ModelRuntime.ARMNN) + encoder = OpenClipTextualEncoder("ViT-B-32__openai", preferred_format=ModelFormat.ARMNN) encoder.download() mock_snapshot_download.assert_called_once_with( @@ -323,21 +331,17 @@ class TestCLIP: mocker: MockerFixture, clip_model_cfg: dict[str, Any], clip_preprocess_cfg: Callable[[Path], dict[str, Any]], - clip_tokenizer_cfg: Callable[[Path], dict[str, Any]], ) -> None: - mocker.patch.object(OpenCLIPEncoder, "download") - mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg) - mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", 
clip_preprocess_cfg) - mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg) + mocker.patch.object(OpenClipVisualEncoder, "download") + mocker.patch.object(OpenClipVisualEncoder, "model_cfg", clip_model_cfg) + mocker.patch.object(OpenClipVisualEncoder, "preprocess_cfg", clip_preprocess_cfg) mocked = mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value mocked.run.return_value = [[self.embedding]] - mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True) - clip_encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="vision") + clip_encoder = OpenClipVisualEncoder("ViT-B-32__openai", cache_dir="test_cache") embedding = clip_encoder.predict(pil_image) - assert clip_encoder.mode == "vision" assert isinstance(embedding, np.ndarray) assert embedding.shape[0] == clip_model_cfg["embed_dim"] assert embedding.dtype == np.float32 @@ -347,22 +351,19 @@ class TestCLIP: self, mocker: MockerFixture, clip_model_cfg: dict[str, Any], - clip_preprocess_cfg: Callable[[Path], dict[str, Any]], clip_tokenizer_cfg: Callable[[Path], dict[str, Any]], ) -> None: - mocker.patch.object(OpenCLIPEncoder, "download") - mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg) - mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg) - mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg) + mocker.patch.object(OpenClipTextualEncoder, "download") + mocker.patch.object(OpenClipTextualEncoder, "model_cfg", clip_model_cfg) + mocker.patch.object(OpenClipTextualEncoder, "tokenizer_cfg", clip_tokenizer_cfg) mocked = mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value mocked.run.return_value = [[self.embedding]] - mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True) + mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True) - clip_encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="text") + clip_encoder = OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache") embedding = clip_encoder.predict("test search query") - assert clip_encoder.mode == "text" assert isinstance(embedding, np.ndarray) assert embedding.shape[0] == clip_model_cfg["embed_dim"] assert embedding.dtype == np.float32 @@ -372,19 +373,18 @@ class TestCLIP: self, mocker: MockerFixture, clip_model_cfg: dict[str, Any], - clip_preprocess_cfg: Callable[[Path], dict[str, Any]], clip_tokenizer_cfg: Callable[[Path], dict[str, Any]], ) -> None: - mocker.patch.object(OpenCLIPEncoder, "download") - mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg) - mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg) - mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg) - mock_tokenizer = mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True).return_value + mocker.patch.object(OpenClipTextualEncoder, "download") + mocker.patch.object(OpenClipTextualEncoder, "model_cfg", clip_model_cfg) + mocker.patch.object(OpenClipTextualEncoder, "tokenizer_cfg", clip_tokenizer_cfg) + mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value + mock_tokenizer = mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True).return_value mock_ids = [randint(0, 50000) for _ in range(77)] mock_tokenizer.encode.return_value = SimpleNamespace(ids=mock_ids) - clip_encoder = OpenCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="text") - clip_encoder._load_tokenizer() + clip_encoder 
= OpenClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache") + clip_encoder._load() tokens = clip_encoder.tokenize("test search query") assert "text" in tokens @@ -397,20 +397,19 @@ class TestCLIP: self, mocker: MockerFixture, clip_model_cfg: dict[str, Any], - clip_preprocess_cfg: Callable[[Path], dict[str, Any]], clip_tokenizer_cfg: Callable[[Path], dict[str, Any]], ) -> None: - mocker.patch.object(OpenCLIPEncoder, "download") - mocker.patch.object(OpenCLIPEncoder, "model_cfg", clip_model_cfg) - mocker.patch.object(OpenCLIPEncoder, "preprocess_cfg", clip_preprocess_cfg) - mocker.patch.object(OpenCLIPEncoder, "tokenizer_cfg", clip_tokenizer_cfg) - mock_tokenizer = mocker.patch("app.models.clip.Tokenizer.from_file", autospec=True).return_value + mocker.patch.object(MClipTextualEncoder, "download") + mocker.patch.object(MClipTextualEncoder, "model_cfg", clip_model_cfg) + mocker.patch.object(MClipTextualEncoder, "tokenizer_cfg", clip_tokenizer_cfg) + mocker.patch.object(InferenceModel, "_make_session", autospec=True).return_value + mock_tokenizer = mocker.patch("app.models.clip.textual.Tokenizer.from_file", autospec=True).return_value mock_ids = [randint(0, 50000) for _ in range(77)] mock_attention_mask = [randint(0, 1) for _ in range(77)] mock_tokenizer.encode.return_value = SimpleNamespace(ids=mock_ids, attention_mask=mock_attention_mask) - clip_encoder = MCLIPEncoder("ViT-B-32__openai", cache_dir="test_cache", mode="text") - clip_encoder._load_tokenizer() + clip_encoder = MClipTextualEncoder("ViT-B-32__openai", cache_dir="test_cache") + clip_encoder._load() tokens = clip_encoder.tokenize("test search query") assert "input_ids" in tokens @@ -430,59 +429,90 @@ class TestFaceRecognition: assert face_recognizer.min_score == 0.5 - def test_basic(self, cv_image: cv2.Mat, mocker: MockerFixture) -> None: - mocker.patch.object(FaceRecognizer, "load") - face_recognizer = FaceRecognizer("buffalo_s", min_score=0.0, cache_dir="test_cache") + def test_detection(self, cv_image: cv2.Mat, mocker: MockerFixture) -> None: + mocker.patch.object(FaceDetector, "load") + face_detector = FaceDetector("buffalo_s", min_score=0.0, cache_dir="test_cache") det_model = mock.Mock() num_faces = 2 bbox = np.random.rand(num_faces, 4).astype(np.float32) - score = np.array([[0.67]] * num_faces).astype(np.float32) + scores = np.array([[0.67]] * num_faces).astype(np.float32) kpss = np.random.rand(num_faces, 5, 2).astype(np.float32) - det_model.detect.return_value = (np.concatenate([bbox, score], axis=-1), kpss) - face_recognizer.det_model = det_model + det_model.detect.return_value = (np.concatenate([bbox, scores], axis=-1), kpss) + face_detector.model = det_model + + faces = face_detector.predict(cv_image) + + assert isinstance(faces, dict) + assert isinstance(faces.get("boxes", None), np.ndarray) + assert isinstance(faces.get("landmarks", None), np.ndarray) + assert isinstance(faces.get("scores", None), np.ndarray) + assert np.equal(faces["boxes"], bbox.round()).all() + assert np.equal(faces["landmarks"], kpss).all() + assert np.equal(faces["scores"], scores).all() + det_model.detect.assert_called_once() + + def test_recognition(self, cv_image: cv2.Mat, mocker: MockerFixture) -> None: + mocker.patch.object(FaceRecognizer, "load") + face_recognizer = FaceRecognizer("buffalo_s", min_score=0.0, cache_dir="test_cache") + + num_faces = 2 + bbox = np.random.rand(num_faces, 4).astype(np.float32) + scores = np.array([0.67] * num_faces).astype(np.float32) + kpss = np.random.rand(num_faces, 5, 2).astype(np.float32) + faces = 
{"boxes": bbox, "landmarks": kpss, "scores": scores} rec_model = mock.Mock() embedding = np.random.rand(num_faces, 512).astype(np.float32) rec_model.get_feat.return_value = embedding - face_recognizer.rec_model = rec_model + face_recognizer.model = rec_model - faces = face_recognizer.predict(cv_image) + faces = face_recognizer.predict(cv_image, faces) + assert isinstance(faces, list) assert len(faces) == num_faces for face in faces: - assert face["imageHeight"] == 800 - assert face["imageWidth"] == 600 - assert isinstance(face["embedding"], np.ndarray) + assert isinstance(face.get("boundingBox"), dict) + assert set(face["boundingBox"]) == {"x1", "y1", "x2", "y2"} + assert all(isinstance(val, np.float32) for val in face["boundingBox"].values()) + assert isinstance(face.get("embedding"), np.ndarray) assert face["embedding"].shape[0] == 512 - assert face["embedding"].dtype == np.float32 + assert isinstance(face.get("score", None), np.float32) - det_model.detect.assert_called_once() - assert rec_model.get_feat.call_count == num_faces + rec_model.get_feat.assert_called_once() + call_args = rec_model.get_feat.call_args_list[0].args + assert len(call_args) == 1 + assert isinstance(call_args[0], list) + assert isinstance(call_args[0][0], np.ndarray) + assert call_args[0][0].shape == (112, 112, 3) @pytest.mark.asyncio class TestCache: async def test_caches(self, mock_get_model: mock.Mock) -> None: model_cache = ModelCache() - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION) - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION) + await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION) + await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION) assert len(model_cache.cache._cache) == 1 mock_get_model.assert_called_once() async def test_kwargs_used(self, mock_get_model: mock.Mock) -> None: model_cache = ModelCache() - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, cache_dir="test_cache") - mock_get_model.assert_called_once_with(ModelType.FACIAL_RECOGNITION, "test_model_name", cache_dir="test_cache") + await model_cache.get( + "test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, cache_dir="test_cache" + ) + mock_get_model.assert_called_once_with( + "test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, cache_dir="test_cache" + ) async def test_different_clip(self, mock_get_model: mock.Mock) -> None: model_cache = ModelCache() - await model_cache.get("test_image_model_name", ModelType.CLIP) - await model_cache.get("test_text_model_name", ModelType.CLIP) + await model_cache.get("test_model_name", ModelType.VISUAL, ModelTask.SEARCH) + await model_cache.get("test_model_name", ModelType.TEXTUAL, ModelTask.SEARCH) mock_get_model.assert_has_calls( [ - mock.call(ModelType.CLIP, "test_image_model_name"), - mock.call(ModelType.CLIP, "test_text_model_name"), + mock.call("test_model_name", ModelType.VISUAL, ModelTask.SEARCH), + mock.call("test_model_name", ModelType.TEXTUAL, ModelTask.SEARCH), ] ) assert len(model_cache.cache._cache) == 2 @@ -490,19 +520,19 @@ class TestCache: @mock.patch("app.models.cache.OptimisticLock", autospec=True) async def test_model_ttl(self, mock_lock_cls: mock.Mock, mock_get_model: mock.Mock) -> None: model_cache = ModelCache() - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100) + await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100) 
mock_lock_cls.return_value.__aenter__.return_value.cas.assert_called_with(mock.ANY, ttl=100) @mock.patch("app.models.cache.SimpleMemoryCache.expire") async def test_revalidate_get(self, mock_cache_expire: mock.Mock, mock_get_model: mock.Mock) -> None: model_cache = ModelCache(revalidate=True) - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100) - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100) + await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100) + await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100) mock_cache_expire.assert_called_once_with(mock.ANY, 100) async def test_profiling(self, mock_get_model: mock.Mock) -> None: model_cache = ModelCache(profiling=True) - await model_cache.get("test_model_name", ModelType.FACIAL_RECOGNITION, ttl=100) + await model_cache.get("test_model_name", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION, ttl=100) profiling = await model_cache.get_profiling() assert isinstance(profiling, dict) assert profiling == model_cache.cache.profiling @@ -510,9 +540,9 @@ class TestCache: async def test_loads_mclip(self) -> None: model_cache = ModelCache() - model = await model_cache.get("XLM-Roberta-Large-Vit-B-32", ModelType.CLIP, mode="text") + model = await model_cache.get("XLM-Roberta-Large-Vit-B-32", ModelType.TEXTUAL, ModelTask.SEARCH) - assert isinstance(model, MCLIPEncoder) + assert isinstance(model, MClipTextualEncoder) assert model.model_name == "XLM-Roberta-Large-Vit-B-32" async def test_raises_exception_if_invalid_model_type(self) -> None: @@ -520,15 +550,55 @@ class TestCache: model_cache = ModelCache() with pytest.raises(ValueError): - await model_cache.get("XLM-Roberta-Large-Vit-B-32", invalid, mode="text") + await model_cache.get("XLM-Roberta-Large-Vit-B-32", ModelType.TEXTUAL, invalid) async def test_raises_exception_if_unknown_model_name(self) -> None: model_cache = ModelCache() with pytest.raises(ValueError): - await model_cache.get("test_model_name", ModelType.CLIP, mode="text") + await model_cache.get("test_model_name", ModelType.TEXTUAL, ModelTask.SEARCH) - async def test_preloads_models(self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock) -> None: + async def test_preloads_clip_models(self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock) -> None: + os.environ["MACHINE_LEARNING_PRELOAD__CLIP"] = "ViT-B-32__openai" + + settings = Settings() + assert settings.preload is not None + assert settings.preload.clip == "ViT-B-32__openai" + + model_cache = ModelCache() + monkeypatch.setattr("app.main.model_cache", model_cache) + + await preload_models(settings.preload) + mock_get_model.assert_has_calls( + [ + mock.call("ViT-B-32__openai", ModelType.TEXTUAL, ModelTask.SEARCH), + mock.call("ViT-B-32__openai", ModelType.VISUAL, ModelTask.SEARCH), + ], + any_order=True, + ) + + async def test_preloads_facial_recognition_models( + self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock + ) -> None: + os.environ["MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION"] = "buffalo_s" + + settings = Settings() + assert settings.preload is not None + assert settings.preload.facial_recognition == "buffalo_s" + + model_cache = ModelCache() + monkeypatch.setattr("app.main.model_cache", model_cache) + + await preload_models(settings.preload) + mock_get_model.assert_has_calls( + [ + mock.call("buffalo_s", ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION), + mock.call("buffalo_s", ModelType.RECOGNITION, 
ModelTask.FACIAL_RECOGNITION), + ], + any_order=True, + ) + + async def test_preloads_all_models(self, monkeypatch: MonkeyPatch, mock_get_model: mock.Mock) -> None: os.environ["MACHINE_LEARNING_PRELOAD__CLIP"] = "ViT-B-32__openai" os.environ["MACHINE_LEARNING_PRELOAD__FACIAL_RECOGNITION"] = "buffalo_s" @@ -541,11 +611,15 @@ class TestCache: monkeypatch.setattr("app.main.model_cache", model_cache) await preload_models(settings.preload) - assert len(model_cache.cache._cache) == 2 - assert mock_get_model.call_count == 2 - await model_cache.get("ViT-B-32__openai", ModelType.CLIP, ttl=100) - await model_cache.get("buffalo_s", ModelType.FACIAL_RECOGNITION, ttl=100) - assert mock_get_model.call_count == 2 + mock_get_model.assert_has_calls( + [ + mock.call("ViT-B-32__openai", ModelType.TEXTUAL, ModelTask.SEARCH), + mock.call("ViT-B-32__openai", ModelType.VISUAL, ModelTask.SEARCH), + mock.call("buffalo_s", ModelType.DETECTION, ModelTask.FACIAL_RECOGNITION), + mock.call("buffalo_s", ModelType.RECOGNITION, ModelTask.FACIAL_RECOGNITION), + ], + any_order=True, + ) @pytest.mark.asyncio @@ -572,7 +646,8 @@ class TestLoad: async def test_load_clears_cache_and_retries_if_os_error(self) -> None: mock_model = mock.Mock(spec=InferenceModel) mock_model.model_name = "test_model_name" - mock_model.model_type = ModelType.CLIP + mock_model.model_type = ModelType.VISUAL + mock_model.model_task = ModelTask.SEARCH mock_model.load.side_effect = [OSError, None] mock_model.loaded = False @@ -597,13 +672,15 @@ class TestEndpoints: response = deployed_app.post( "http://localhost:3003/predict", - data={"modelName": "ViT-B-32__openai", "modelType": "clip", "options": json.dumps({"mode": "vision"})}, + data={"entries": json.dumps({"clip": {"visual": {"modelName": "ViT-B-32__openai"}}})}, files={"image": byte_image.getvalue()}, ) actual = response.json() assert response.status_code == 200 - assert np.allclose(expected, actual) + assert isinstance(actual, dict) + assert isinstance(actual.get("clip", None), list) + assert np.allclose(expected, actual["clip"]) def test_clip_text_endpoint(self, responses: dict[str, Any], deployed_app: TestClient) -> None: expected = responses["clip"]["text"] @@ -611,38 +688,49 @@ class TestEndpoints: response = deployed_app.post( "http://localhost:3003/predict", data={ - "modelName": "ViT-B-32__openai", - "modelType": "clip", + "entries": json.dumps( + { + "clip": {"textual": {"modelName": "ViT-B-32__openai"}}, + }, + ), "text": "test search query", - "options": json.dumps({"mode": "text"}), }, ) actual = response.json() assert response.status_code == 200 - assert np.allclose(expected, actual) + assert isinstance(actual, dict) + assert isinstance(actual.get("clip", None), list) + assert np.allclose(expected, actual["clip"]) def test_face_endpoint(self, pil_image: Image.Image, responses: dict[str, Any], deployed_app: TestClient) -> None: byte_image = BytesIO() pil_image.save(byte_image, format="jpeg") - expected = responses["facial-recognition"] response = deployed_app.post( "http://localhost:3003/predict", data={ - "modelName": "buffalo_l", - "modelType": "facial-recognition", - "options": json.dumps({"minScore": 0.034}), + "entries": json.dumps( + { + "facial-recognition": { + "detection": {"modelName": "buffalo_l", "options": {"minScore": 0.034}}, + "recognition": {"modelName": "buffalo_l"}, + } + } + ) }, files={"image": byte_image.getvalue()}, ) actual = response.json() assert response.status_code == 200 - assert len(expected) == len(actual) - for expected_face, actual_face in zip(expected, 
actual): - assert expected_face["imageHeight"] == actual_face["imageHeight"] - assert expected_face["imageWidth"] == actual_face["imageWidth"] + assert isinstance(actual, dict) + assert actual.get("imageHeight", None) == responses["imageHeight"] + assert actual.get("imageWidth", None) == responses["imageWidth"] + assert "facial-recognition" in actual and isinstance(actual["facial-recognition"], list) + assert len(actual["facial-recognition"]) == len(responses["facial-recognition"]) + + for expected_face, actual_face in zip(responses["facial-recognition"], actual["facial-recognition"]): assert expected_face["boundingBox"] == actual_face["boundingBox"] assert np.allclose(expected_face["embedding"], actual_face["embedding"]) assert np.allclose(expected_face["score"], actual_face["score"]) diff --git a/machine-learning/locustfile.py b/machine-learning/locustfile.py index d32bc17d55..81087bee8c 100644 --- a/machine-learning/locustfile.py +++ b/machine-learning/locustfile.py @@ -37,7 +37,6 @@ def on_test_start(environment: Environment, **kwargs: Any) -> None: global byte_image assert environment.parsed_options is not None image = Image.new("RGB", (environment.parsed_options.image_size, environment.parsed_options.image_size)) - byte_image = BytesIO() image.save(byte_image, format="jpeg") @@ -45,34 +44,25 @@ class InferenceLoadTest(HttpUser): abstract: bool = True host = "http://127.0.0.1:3003" data: bytes - headers: dict[str, str] = {"Content-Type": "image/jpg"} # re-use the image across all instances in a process def on_start(self) -> None: - global byte_image self.data = byte_image.getvalue() class CLIPTextFormDataLoadTest(InferenceLoadTest): @task def encode_text(self) -> None: - data = [ - ("modelName", self.environment.parsed_options.clip_model), - ("modelType", "clip"), - ("options", json.dumps({"mode": "text"})), - ("text", "test search query"), - ] + request = {"clip": {"textual": {"modelName": self.environment.parsed_options.clip_model}}} + data = [("entries", json.dumps(request)), ("text", "test search query")] self.client.post("/predict", data=data) class CLIPVisionFormDataLoadTest(InferenceLoadTest): @task def encode_image(self) -> None: - data = [ - ("modelName", self.environment.parsed_options.clip_model), - ("modelType", "clip"), - ("options", json.dumps({"mode": "vision"})), - ] + request = {"clip": {"visual": {"modelName": self.environment.parsed_options.clip_model, "options": {}}}} + data = [("entries", json.dumps(request))] files = {"image": self.data} self.client.post("/predict", data=data, files=files) @@ -80,11 +70,18 @@ class CLIPVisionFormDataLoadTest(InferenceLoadTest): class RecognitionFormDataLoadTest(InferenceLoadTest): @task def recognize(self) -> None: - data = [ - ("modelName", self.environment.parsed_options.face_model), - ("modelType", "facial-recognition"), - ("options", json.dumps({"minScore": self.environment.parsed_options.face_min_score})), - ] + request = { + "facial-recognition": { + "recognition": { + "modelName": self.environment.parsed_options.face_model, + "options": {"minScore": self.environment.parsed_options.face_min_score}, + }, + "detection": { + "modelName": self.environment.parsed_options.face_model, + }, + } + } + data = [("entries", json.dumps(request))] files = {"image": self.data} self.client.post("/predict", data=data, files=files) diff --git a/machine-learning/responses.json b/machine-learning/responses.json index 209f05cffb..8fe9cd61cc 100644 --- a/machine-learning/responses.json +++ b/machine-learning/responses.json @@ -213,8 +213,6 @@ }, 
"facial-recognition": [ { - "imageWidth": 600, - "imageHeight": 800, "boundingBox": { "x1": 690.0, "y1": -89.0, @@ -325,5 +323,7 @@ -0.077056274, 0.002099529 ] } - ] + ], + "imageWidth": 600, + "imageHeight": 800 } diff --git a/mobile/openapi/README.md b/mobile/openapi/README.md index 93ea6a551c..914d291e1e 100644 Binary files a/mobile/openapi/README.md and b/mobile/openapi/README.md differ diff --git a/mobile/openapi/lib/api.dart b/mobile/openapi/lib/api.dart index 17602fafaf..4d7a4a533d 100644 Binary files a/mobile/openapi/lib/api.dart and b/mobile/openapi/lib/api.dart differ diff --git a/mobile/openapi/lib/api_client.dart b/mobile/openapi/lib/api_client.dart index 94187719a8..dbfec53b93 100644 Binary files a/mobile/openapi/lib/api_client.dart and b/mobile/openapi/lib/api_client.dart differ diff --git a/mobile/openapi/lib/api_helper.dart b/mobile/openapi/lib/api_helper.dart index 6f0acc0976..762b728869 100644 Binary files a/mobile/openapi/lib/api_helper.dart and b/mobile/openapi/lib/api_helper.dart differ diff --git a/mobile/openapi/lib/model/clip_config.dart b/mobile/openapi/lib/model/clip_config.dart index 5a1d429aea..6e95c15fbf 100644 Binary files a/mobile/openapi/lib/model/clip_config.dart and b/mobile/openapi/lib/model/clip_config.dart differ diff --git a/mobile/openapi/lib/model/clip_mode.dart b/mobile/openapi/lib/model/clip_mode.dart deleted file mode 100644 index 98a38759da..0000000000 Binary files a/mobile/openapi/lib/model/clip_mode.dart and /dev/null differ diff --git a/mobile/openapi/lib/model/recognition_config.dart b/mobile/openapi/lib/model/facial_recognition_config.dart similarity index 59% rename from mobile/openapi/lib/model/recognition_config.dart rename to mobile/openapi/lib/model/facial_recognition_config.dart index 3080023bbd..8a3a5ab9c2 100644 Binary files a/mobile/openapi/lib/model/recognition_config.dart and b/mobile/openapi/lib/model/facial_recognition_config.dart differ diff --git a/mobile/openapi/lib/model/model_type.dart b/mobile/openapi/lib/model/model_type.dart deleted file mode 100644 index 36f7424199..0000000000 Binary files a/mobile/openapi/lib/model/model_type.dart and /dev/null differ diff --git a/mobile/openapi/lib/model/system_config_machine_learning_dto.dart b/mobile/openapi/lib/model/system_config_machine_learning_dto.dart index fc3aae1804..3923bacad4 100644 Binary files a/mobile/openapi/lib/model/system_config_machine_learning_dto.dart and b/mobile/openapi/lib/model/system_config_machine_learning_dto.dart differ diff --git a/open-api/immich-openapi-specs.json b/open-api/immich-openapi-specs.json index 94feb6cfcf..c53f30527c 100644 --- a/open-api/immich-openapi-specs.json +++ b/open-api/immich-openapi-specs.json @@ -7878,14 +7878,8 @@ "enabled": { "type": "boolean" }, - "mode": { - "$ref": "#/components/schemas/CLIPMode" - }, "modelName": { "type": "string" - }, - "modelType": { - "$ref": "#/components/schemas/ModelType" } }, "required": [ @@ -7894,13 +7888,6 @@ ], "type": "object" }, - "CLIPMode": { - "enum": [ - "vision", - "text" - ], - "type": "string" - }, "CQMode": { "enum": [ "auto", @@ -8323,6 +8310,40 @@ ], "type": "object" }, + "FacialRecognitionConfig": { + "properties": { + "enabled": { + "type": "boolean" + }, + "maxDistance": { + "format": "float", + "maximum": 2, + "minimum": 0, + "type": "number" + }, + "minFaces": { + "minimum": 1, + "type": "integer" + }, + "minScore": { + "format": "float", + "maximum": 1, + "minimum": 0, + "type": "number" + }, + "modelName": { + "type": "string" + } + }, + "required": [ + "enabled", + 
"maxDistance", + "minFaces", + "minScore", + "modelName" + ], + "type": "object" + }, "FileChecksumDto": { "properties": { "filenames": { @@ -9039,13 +9060,6 @@ }, "type": "object" }, - "ModelType": { - "enum": [ - "facial-recognition", - "clip" - ], - "type": "string" - }, "OAuthAuthorizeResponseDto": { "properties": { "url": { @@ -9379,43 +9393,6 @@ ], "type": "string" }, - "RecognitionConfig": { - "properties": { - "enabled": { - "type": "boolean" - }, - "maxDistance": { - "format": "float", - "maximum": 2, - "minimum": 0, - "type": "number" - }, - "minFaces": { - "minimum": 1, - "type": "integer" - }, - "minScore": { - "format": "float", - "maximum": 1, - "minimum": 0, - "type": "number" - }, - "modelName": { - "type": "string" - }, - "modelType": { - "$ref": "#/components/schemas/ModelType" - } - }, - "required": [ - "enabled", - "maxDistance", - "minFaces", - "minScore", - "modelName" - ], - "type": "object" - }, "ReverseGeocodingStateResponseDto": { "properties": { "lastImportFileName": { @@ -10521,7 +10498,7 @@ "type": "boolean" }, "facialRecognition": { - "$ref": "#/components/schemas/RecognitionConfig" + "$ref": "#/components/schemas/FacialRecognitionConfig" }, "url": { "type": "string" diff --git a/open-api/typescript-sdk/src/fetch-client.ts b/open-api/typescript-sdk/src/fetch-client.ts index c604de9f8a..d294110be1 100644 --- a/open-api/typescript-sdk/src/fetch-client.ts +++ b/open-api/typescript-sdk/src/fetch-client.ts @@ -962,27 +962,24 @@ export type SystemConfigLoggingDto = { }; export type ClipConfig = { enabled: boolean; - mode?: CLIPMode; modelName: string; - modelType?: ModelType; }; export type DuplicateDetectionConfig = { enabled: boolean; maxDistance: number; }; -export type RecognitionConfig = { +export type FacialRecognitionConfig = { enabled: boolean; maxDistance: number; minFaces: number; minScore: number; modelName: string; - modelType?: ModelType; }; export type SystemConfigMachineLearningDto = { clip: ClipConfig; duplicateDetection: DuplicateDetectionConfig; enabled: boolean; - facialRecognition: RecognitionConfig; + facialRecognition: FacialRecognitionConfig; url: string; }; export type SystemConfigMapDto = { @@ -3074,14 +3071,6 @@ export enum LogLevel { Error = "error", Fatal = "fatal" } -export enum CLIPMode { - Vision = "vision", - Text = "text" -} -export enum ModelType { - FacialRecognition = "facial-recognition", - Clip = "clip" -} export enum TimeBucketSize { Day = "DAY", Month = "MONTH" diff --git a/server/src/dtos/model-config.dto.ts b/server/src/dtos/model-config.dto.ts index a3efd19f82..5dc8237414 100644 --- a/server/src/dtos/model-config.dto.ts +++ b/server/src/dtos/model-config.dto.ts @@ -1,8 +1,7 @@ import { ApiProperty } from '@nestjs/swagger'; import { Type } from 'class-transformer'; -import { IsEnum, IsNotEmpty, IsNumber, IsString, Max, Min } from 'class-validator'; -import { CLIPMode, ModelType } from 'src/interfaces/machine-learning.interface'; -import { Optional, ValidateBoolean } from 'src/validation'; +import { IsNotEmpty, IsNumber, IsString, Max, Min } from 'class-validator'; +import { ValidateBoolean } from 'src/validation'; export class TaskConfig { @ValidateBoolean() @@ -13,19 +12,9 @@ export class ModelConfig extends TaskConfig { @IsString() @IsNotEmpty() modelName!: string; - - @IsEnum(ModelType) - @Optional() - @ApiProperty({ enumName: 'ModelType', enum: ModelType }) - modelType?: ModelType; } -export class CLIPConfig extends ModelConfig { - @IsEnum(CLIPMode) - @Optional() - @ApiProperty({ enumName: 'CLIPMode', enum: CLIPMode }) - 
mode?: CLIPMode; -} +export class CLIPConfig extends ModelConfig {} export class DuplicateDetectionConfig extends TaskConfig { @IsNumber() @@ -36,7 +25,7 @@ export class DuplicateDetectionConfig extends TaskConfig { maxDistance!: number; } -export class RecognitionConfig extends ModelConfig { +export class FacialRecognitionConfig extends ModelConfig { @IsNumber() @Min(0) @Max(1) diff --git a/server/src/dtos/system-config.dto.ts b/server/src/dtos/system-config.dto.ts index 4dcea6ec3d..b593c0edbf 100644 --- a/server/src/dtos/system-config.dto.ts +++ b/server/src/dtos/system-config.dto.ts @@ -30,7 +30,7 @@ import { TranscodePolicy, VideoCodec, } from 'src/config'; -import { CLIPConfig, DuplicateDetectionConfig, RecognitionConfig } from 'src/dtos/model-config.dto'; +import { CLIPConfig, DuplicateDetectionConfig, FacialRecognitionConfig } from 'src/dtos/model-config.dto'; import { ConcurrentQueueName, QueueName } from 'src/interfaces/job.interface'; import { ValidateBoolean, validateCronExpression } from 'src/validation'; @@ -270,10 +270,10 @@ class SystemConfigMachineLearningDto { @IsObject() duplicateDetection!: DuplicateDetectionConfig; - @Type(() => RecognitionConfig) + @Type(() => FacialRecognitionConfig) @ValidateNested() @IsObject() - facialRecognition!: RecognitionConfig; + facialRecognition!: FacialRecognitionConfig; } enum MapTheme { diff --git a/server/src/interfaces/machine-learning.interface.ts b/server/src/interfaces/machine-learning.interface.ts index 0aeed7635a..143281c23a 100644 --- a/server/src/interfaces/machine-learning.interface.ts +++ b/server/src/interfaces/machine-learning.interface.ts @@ -1,15 +1,5 @@ -import { CLIPConfig, RecognitionConfig } from 'src/dtos/model-config.dto'; - export const IMachineLearningRepository = 'IMachineLearningRepository'; -export interface VisionModelInput { - imagePath: string; -} - -export interface TextModelInput { - text: string; -} - export interface BoundingBox { x1: number; y1: number; @@ -17,26 +7,51 @@ export interface BoundingBox { y2: number; } -export interface DetectFaceResult { - imageWidth: number; - imageHeight: number; - boundingBox: BoundingBox; - score: number; - embedding: number[]; +export enum ModelTask { + FACIAL_RECOGNITION = 'facial-recognition', + SEARCH = 'clip', } export enum ModelType { - FACIAL_RECOGNITION = 'facial-recognition', - CLIP = 'clip', + DETECTION = 'detection', + PIPELINE = 'pipeline', + RECOGNITION = 'recognition', + TEXTUAL = 'textual', + VISUAL = 'visual', } -export enum CLIPMode { - VISION = 'vision', - TEXT = 'text', +export type ModelPayload = { imagePath: string } | { text: string }; + +type ModelOptions = { modelName: string }; + +export type FaceDetectionOptions = ModelOptions & { minScore: number }; + +type VisualResponse = { imageHeight: number; imageWidth: number }; +export type ClipVisualRequest = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: ModelOptions } }; +export type ClipVisualResponse = { [ModelTask.SEARCH]: number[] } & VisualResponse; + +export type ClipTextualRequest = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: ModelOptions } }; +export type ClipTextualResponse = { [ModelTask.SEARCH]: number[] }; + +export type FacialRecognitionRequest = { + [ModelTask.FACIAL_RECOGNITION]: { + [ModelType.DETECTION]: FaceDetectionOptions; + [ModelType.RECOGNITION]: ModelOptions; + }; +}; + +export interface Face { + boundingBox: BoundingBox; + embedding: number[]; + score: number; } +export type FacialRecognitionResponse = { [ModelTask.FACIAL_RECOGNITION]: Face[] } & VisualResponse; +export type 
DetectedFaces = { faces: Face[] } & VisualResponse;
+export type MachineLearningRequest = ClipVisualRequest | ClipTextualRequest | FacialRecognitionRequest;
+
 export interface IMachineLearningRepository {
-  encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise<number[]>;
-  encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise<number[]>;
-  detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise<DetectFaceResult[]>;
+  encodeImage(url: string, imagePath: string, config: ModelOptions): Promise<number[]>;
+  encodeText(url: string, text: string, config: ModelOptions): Promise<number[]>;
+  detectFaces(url: string, imagePath: string, config: FaceDetectionOptions): Promise<DetectedFaces>;
 }
diff --git a/server/src/interfaces/search.interface.ts b/server/src/interfaces/search.interface.ts
index ce9e2a1940..d5382a04fa 100644
--- a/server/src/interfaces/search.interface.ts
+++ b/server/src/interfaces/search.interface.ts
@@ -37,8 +37,6 @@ export interface SearchExploreItem {
   items: SearchExploreItemSet;
 }
 
-export type Embedding = number[];
-
 export interface SearchAssetIDOptions {
   checksum?: Buffer;
   deviceAssetId?: string;
@@ -106,7 +104,7 @@ export interface SearchExifOptions {
 }
 
 export interface SearchEmbeddingOptions {
-  embedding: Embedding;
+  embedding: number[];
   userIds: string[];
 }
 
@@ -154,7 +152,7 @@ export interface FaceEmbeddingSearch extends SearchEmbeddingOptions {
 
 export interface AssetDuplicateSearch {
   assetId: string;
-  embedding: Embedding;
+  embedding: number[];
   maxDistance?: number;
   type: AssetType;
   userIds: string[];
diff --git a/server/src/repositories/machine-learning.repository.ts b/server/src/repositories/machine-learning.repository.ts
index bff22b9507..405e5a421d 100644
--- a/server/src/repositories/machine-learning.repository.ts
+++ b/server/src/repositories/machine-learning.repository.ts
@@ -1,13 +1,16 @@
 import { Injectable } from '@nestjs/common';
 import { readFile } from 'node:fs/promises';
-import { CLIPConfig, ModelConfig, RecognitionConfig } from 'src/dtos/model-config.dto';
+import { CLIPConfig } from 'src/dtos/model-config.dto';
 import {
-  CLIPMode,
-  DetectFaceResult,
+  ClipTextualResponse,
+  ClipVisualResponse,
+  FaceDetectionOptions,
+  FacialRecognitionResponse,
   IMachineLearningRepository,
+  MachineLearningRequest,
+  ModelPayload,
+  ModelTask,
   ModelType,
-  TextModelInput,
-  VisionModelInput,
 } from 'src/interfaces/machine-learning.interface';
 import { Instrumentation } from 'src/utils/instrumentation';
 
@@ -16,8 +19,8 @@ const errorPrefix = 'Machine learning request';
 @Instrumentation()
 @Injectable()
 export class MachineLearningRepository implements IMachineLearningRepository {
-  private async predict<T>(url: string, input: TextModelInput | VisionModelInput, config: ModelConfig): Promise<T> {
-    const formData = await this.getFormData(input, config);
+  private async predict<T>(url: string, payload: ModelPayload, config: MachineLearningRequest): Promise<T> {
+    const formData = await this.getFormData(payload, config);
 
     const res = await fetch(new URL('/predict', url), { method: 'POST', body: formData }).catch(
       (error: Error | any) => {
@@ -26,50 +29,46 @@ export class MachineLearningRepository implements IMachineLearningRepository {
     );
 
     if (res.status >= 400) {
-      const modelType = config.modelType ?
` for ${config.modelType.replace('-', ' ')}` : ''; - throw new Error(`${errorPrefix}${modelType} failed with status ${res.status}: ${res.statusText}`); + throw new Error(`${errorPrefix} '${JSON.stringify(config)}' failed with status ${res.status}: ${res.statusText}`); } return res.json(); } - detectFaces(url: string, input: VisionModelInput, config: RecognitionConfig): Promise { - return this.predict(url, input, { ...config, modelType: ModelType.FACIAL_RECOGNITION }); + async detectFaces(url: string, imagePath: string, { modelName, minScore }: FaceDetectionOptions) { + const request = { + [ModelTask.FACIAL_RECOGNITION]: { + [ModelType.DETECTION]: { modelName, minScore }, + [ModelType.RECOGNITION]: { modelName }, + }, + }; + const response = await this.predict(url, { imagePath }, request); + return { + imageHeight: response.imageHeight, + imageWidth: response.imageWidth, + faces: response[ModelTask.FACIAL_RECOGNITION], + }; } - encodeImage(url: string, input: VisionModelInput, config: CLIPConfig): Promise { - return this.predict(url, input, { - ...config, - modelType: ModelType.CLIP, - mode: CLIPMode.VISION, - } as CLIPConfig); + async encodeImage(url: string, imagePath: string, { modelName }: CLIPConfig) { + const request = { [ModelTask.SEARCH]: { [ModelType.VISUAL]: { modelName } } }; + const response = await this.predict(url, { imagePath }, request); + return response[ModelTask.SEARCH]; } - encodeText(url: string, input: TextModelInput, config: CLIPConfig): Promise { - return this.predict(url, input, { - ...config, - modelType: ModelType.CLIP, - mode: CLIPMode.TEXT, - } as CLIPConfig); + async encodeText(url: string, text: string, { modelName }: CLIPConfig) { + const request = { [ModelTask.SEARCH]: { [ModelType.TEXTUAL]: { modelName } } }; + const response = await this.predict(url, { text }, request); + return response[ModelTask.SEARCH]; } - private async getFormData(input: TextModelInput | VisionModelInput, config: ModelConfig): Promise { + private async getFormData(payload: ModelPayload, config: MachineLearningRequest): Promise { const formData = new FormData(); - const { enabled, modelName, modelType, ...options } = config; - if (!enabled) { - throw new Error(`${modelType} is not enabled`); - } + formData.append('entries', JSON.stringify(config)); - formData.append('modelName', modelName); - if (modelType) { - formData.append('modelType', modelType); - } - if (options) { - formData.append('options', JSON.stringify(options)); - } - if ('imagePath' in input) { - formData.append('image', new Blob([await readFile(input.imagePath)])); - } else if ('text' in input) { - formData.append('text', input.text); + if ('imagePath' in payload) { + formData.append('image', new Blob([await readFile(payload.imagePath)])); + } else if ('text' in payload) { + formData.append('text', payload.text); } else { throw new Error('Invalid input'); } diff --git a/server/src/services/person.service.spec.ts b/server/src/services/person.service.spec.ts index 1644c0c896..56447c8d20 100644 --- a/server/src/services/person.service.spec.ts +++ b/server/src/services/person.service.spec.ts @@ -7,7 +7,7 @@ import { IAssetRepository, WithoutProperty } from 'src/interfaces/asset.interfac import { ICryptoRepository } from 'src/interfaces/crypto.interface'; import { IJobRepository, JobName, JobStatus } from 'src/interfaces/job.interface'; import { ILoggerRepository } from 'src/interfaces/logger.interface'; -import { IMachineLearningRepository } from 'src/interfaces/machine-learning.interface'; +import { DetectedFaces, 
IMachineLearningRepository } from 'src/interfaces/machine-learning.interface'; import { IMediaRepository } from 'src/interfaces/media.interface'; import { IMoveRepository } from 'src/interfaces/move.interface'; import { IPersonRepository } from 'src/interfaces/person.interface'; @@ -46,19 +46,21 @@ const responseDto: PersonResponseDto = { const statistics = { assets: 3 }; -const detectFaceMock = { - assetId: 'asset-1', - personId: 'person-1', - boundingBox: { - x1: 100, - y1: 100, - x2: 200, - y2: 200, - }, +const detectFaceMock: DetectedFaces = { + faces: [ + { + boundingBox: { + x1: 100, + y1: 100, + x2: 200, + y2: 200, + }, + embedding: [1, 2, 3, 4], + score: 0.2, + }, + ], imageHeight: 500, imageWidth: 400, - embedding: [1, 2, 3, 4], - score: 0.2, }; describe(PersonService.name, () => { @@ -642,21 +644,13 @@ describe(PersonService.name, () => { it('should handle no results', async () => { const start = Date.now(); - machineLearningMock.detectFaces.mockResolvedValue([]); + machineLearningMock.detectFaces.mockResolvedValue({ imageHeight: 500, imageWidth: 400, faces: [] }); assetMock.getByIds.mockResolvedValue([assetStub.image]); await sut.handleDetectFaces({ id: assetStub.image.id }); expect(machineLearningMock.detectFaces).toHaveBeenCalledWith( 'http://immich-machine-learning:3003', - { - imagePath: assetStub.image.previewPath, - }, - { - enabled: true, - maxDistance: 0.5, - minScore: 0.7, - minFaces: 3, - modelName: 'buffalo_l', - }, + assetStub.image.previewPath, + expect.objectContaining({ minScore: 0.7, modelName: 'buffalo_l' }), ); expect(personMock.createFaces).not.toHaveBeenCalled(); expect(jobMock.queue).not.toHaveBeenCalled(); @@ -671,7 +665,7 @@ describe(PersonService.name, () => { it('should create a face with no person and queue recognition job', async () => { personMock.createFaces.mockResolvedValue([faceStub.face1.id]); - machineLearningMock.detectFaces.mockResolvedValue([detectFaceMock]); + machineLearningMock.detectFaces.mockResolvedValue(detectFaceMock); searchMock.searchFaces.mockResolvedValue([{ face: faceStub.face1, distance: 0.7 }]); assetMock.getByIds.mockResolvedValue([assetStub.image]); const face = { diff --git a/server/src/services/person.service.ts b/server/src/services/person.service.ts index de0c191667..faa65974d4 100644 --- a/server/src/services/person.service.ts +++ b/server/src/services/person.service.ts @@ -333,26 +333,28 @@ export class PersonService { return JobStatus.SKIPPED; } - const faces = await this.machineLearningRepository.detectFaces( + if (!asset.isVisible) { + return JobStatus.SKIPPED; + } + + const { imageHeight, imageWidth, faces } = await this.machineLearningRepository.detectFaces( machineLearning.url, - { imagePath: asset.previewPath }, + asset.previewPath, machineLearning.facialRecognition, ); this.logger.debug(`${faces.length} faces detected in ${asset.previewPath}`); - this.logger.verbose(faces.map((face) => ({ ...face, embedding: `vector(${face.embedding.length})` }))); if (faces.length > 0) { await this.jobRepository.queue({ name: JobName.QUEUE_FACIAL_RECOGNITION, data: { force: false } }); - const mappedFaces = faces.map((face) => ({ assetId: asset.id, embedding: face.embedding, - imageHeight: face.imageHeight, - imageWidth: face.imageWidth, + imageHeight, + imageWidth, boundingBoxX1: face.boundingBox.x1, - boundingBoxX2: face.boundingBox.x2, boundingBoxY1: face.boundingBox.y1, + boundingBoxX2: face.boundingBox.x2, boundingBoxY2: face.boundingBox.y2, })); diff --git a/server/src/services/search.service.ts 
b/server/src/services/search.service.ts index 8c89218138..9213cc4290 100644 --- a/server/src/services/search.service.ts +++ b/server/src/services/search.service.ts @@ -102,12 +102,7 @@ export class SearchService { const userIds = await this.getUserIdsToSearch(auth); - const embedding = await this.machineLearning.encodeText( - machineLearning.url, - { text: dto.query }, - machineLearning.clip, - ); - + const embedding = await this.machineLearning.encodeText(machineLearning.url, dto.query, machineLearning.clip); const page = dto.page ?? 1; const size = dto.size || 100; const { hasNextPage, items } = await this.searchRepository.searchSmart( diff --git a/server/src/services/smart-info.service.spec.ts b/server/src/services/smart-info.service.spec.ts index 7ac6dac414..95f76edc49 100644 --- a/server/src/services/smart-info.service.spec.ts +++ b/server/src/services/smart-info.service.spec.ts @@ -108,8 +108,8 @@ describe(SmartInfoService.name, () => { expect(machineMock.encodeImage).toHaveBeenCalledWith( 'http://immich-machine-learning:3003', - { imagePath: assetStub.image.previewPath }, - { enabled: true, modelName: 'ViT-B-32__openai' }, + assetStub.image.previewPath, + expect.objectContaining({ modelName: 'ViT-B-32__openai' }), ); expect(searchMock.upsert).toHaveBeenCalledWith(assetStub.image.id, [0.01, 0.02, 0.03]); }); diff --git a/server/src/services/smart-info.service.ts b/server/src/services/smart-info.service.ts index f902aa7e57..46a57c3cd0 100644 --- a/server/src/services/smart-info.service.ts +++ b/server/src/services/smart-info.service.ts @@ -93,9 +93,9 @@ export class SmartInfoService { return JobStatus.FAILED; } - const clipEmbedding = await this.machineLearning.encodeImage( + const embedding = await this.machineLearning.encodeImage( machineLearning.url, - { imagePath: asset.previewPath }, + asset.previewPath, machineLearning.clip, ); @@ -104,7 +104,7 @@ export class SmartInfoService { await this.databaseRepository.wait(DatabaseLock.CLIPDimSize); } - await this.repository.upsert(asset.id, clipEmbedding); + await this.repository.upsert(asset.id, embedding); return JobStatus.SUCCESS; }
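
Reviewer note: the test and locustfile hunks above exercise the new request contract, where a single "entries" form field describes every model the caller wants to run. The following is a minimal client-side sketch of that contract, not part of the change itself. It assumes a machine-learning container reachable at localhost:3003, the third-party requests library, the ViT-B-32__openai model used in the tests, and a hypothetical local example.jpg.

# Sketch only: mirrors the request shapes used in the tests and locustfile above.
import json

import requests

ML_URL = "http://localhost:3003/predict"  # assumed local deployment

# Text search: only the textual CLIP encoder is requested.
text_entries = {"clip": {"textual": {"modelName": "ViT-B-32__openai"}}}
text_response = requests.post(
    ML_URL,
    data={"entries": json.dumps(text_entries), "text": "test search query"},
)
text_embedding = text_response.json()["clip"]  # list of floats, length embed_dim

# Image search: the visual CLIP encoder is requested and the image is sent as a file.
image_entries = {"clip": {"visual": {"modelName": "ViT-B-32__openai"}}}
with open("example.jpg", "rb") as f:  # hypothetical local image
    image_response = requests.post(
        ML_URL,
        data={"entries": json.dumps(image_entries)},
        files={"image": f.read()},
    )
image_embedding = image_response.json()["clip"]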
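The facial-recognition path now chains detection and recognition in one request, and imageWidth/imageHeight move to the top level of the response, as the responses.json and MachineLearningRepository hunks show. A hedged sketch of consuming that response follows, under the same assumptions as the previous example (local server, requests library, buffalo_l model, a hypothetical example.jpg).

# Sketch only: one pipeline call covering detection plus recognition.
import json

import requests

entries = {
    "facial-recognition": {
        "detection": {"modelName": "buffalo_l", "options": {"minScore": 0.7}},
        "recognition": {"modelName": "buffalo_l"},
    }
}

with open("example.jpg", "rb") as f:  # hypothetical local image
    response = requests.post(
        "http://localhost:3003/predict",
        data={"entries": json.dumps(entries)},
        files={"image": f.read()},
    )

payload = response.json()
# Image dimensions now sit at the top level, next to the per-task results.
print(payload["imageWidth"], payload["imageHeight"])
for face in payload["facial-recognition"]:
    box = face["boundingBox"]  # keys: x1, y1, x2, y2
    print(box["x1"], box["y1"], box["x2"], box["y2"], face["score"], len(face["embedding"]))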