diff --git a/machine-learning/ann/build.sh b/machine-learning/ann/build.sh deleted file mode 100644 index 219c0ef1b1..0000000000 --- a/machine-learning/ann/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/usr/bin/env sh - -g++ -shared -O3 -o libann.so -fuse-ld=gold -std=c++17 -I"$ARMNN_PATH"/include -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -L"$ARMNN_PATH" ann.cpp diff --git a/machine-learning/ann/export/build-converter.sh b/machine-learning/ann/export/build-converter.sh deleted file mode 100755 index 94e9ebec2b..0000000000 --- a/machine-learning/ann/export/build-converter.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env sh - -cd armnn-23.11/ || exit -g++ -o ../armnnconverter -O1 -DARMNN_ONNX_PARSER -DARMNN_SERIALIZER -DARMNN_TF_LITE_PARSER -fuse-ld=gold -std=c++17 -Iinclude -Isrc/armnnUtils -Ithird-party -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -larmnnSerializer -L../armnn src/armnnConverter/ArmnnConverter.cpp diff --git a/machine-learning/ann/export/run.py b/machine-learning/ann/export/run.py deleted file mode 100644 index 91c659a02c..0000000000 --- a/machine-learning/ann/export/run.py +++ /dev/null @@ -1,157 +0,0 @@ -import logging -import os -import platform -import subprocess -from abc import abstractmethod - -import onnx -import open_clip -import torch -from onnx2torch import convert -from onnxruntime.tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed -from tinynn.converter import TFLiteConverter - - -class ExportBase(torch.nn.Module): - input_shape: tuple[int, ...] - - def __init__(self, device: torch.device, name: str): - super().__init__() - self.device = device - self.name = name - self.optimize = 5 - self.nchw_transpose = False - - @abstractmethod - def forward(self, input_tensor: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor]: - pass - - def dummy_input(self) -> torch.FloatTensor: - return torch.rand((1, 3, 224, 224), device=self.device) - - -class ArcFace(ExportBase): - input_shape = (1, 3, 112, 112) - - def __init__(self, onnx_model_path: str, device: torch.device): - name, _ = os.path.splitext(os.path.basename(onnx_model_path)) - super().__init__(device, name) - onnx_model = onnx.load_model(onnx_model_path) - make_input_shape_fixed(onnx_model.graph, onnx_model.graph.input[0].name, self.input_shape) - fix_output_shapes(onnx_model) - self.model = convert(onnx_model).to(device) - if self.device.type == "cuda": - self.model = self.model.half() - - def forward(self, input_tensor: torch.Tensor) -> torch.FloatTensor: - embedding: torch.FloatTensor = self.model( - input_tensor.half() if self.device.type == "cuda" else input_tensor - ).float() - assert isinstance(embedding, torch.FloatTensor) - return embedding - - def dummy_input(self) -> torch.FloatTensor: - return torch.rand(self.input_shape, device=self.device) - - -class RetinaFace(ExportBase): - input_shape = (1, 3, 640, 640) - - def __init__(self, onnx_model_path: str, device: torch.device): - name, _ = os.path.splitext(os.path.basename(onnx_model_path)) - super().__init__(device, name) - self.optimize = 3 - self.model = convert(onnx_model_path).eval().to(device) - if self.device.type == "cuda": - self.model = self.model.half() - - def forward(self, input_tensor: torch.Tensor) -> tuple[torch.FloatTensor]: - out: torch.Tensor = self.model(input_tensor.half() if self.device.type == "cuda" else input_tensor) - return tuple(o.float() for o in out) - - def dummy_input(self) -> torch.FloatTensor: - return torch.rand(self.input_shape, device=self.device) - - -class 
ClipVision(ExportBase): - input_shape = (1, 3, 224, 224) - - def __init__(self, model_name: str, weights: str, device: torch.device): - super().__init__(device, model_name + "__" + weights) - self.model = open_clip.create_model( - model_name, - weights, - precision="fp16" if device.type == "cuda" else "fp32", - jit=False, - require_pretrained=True, - device=device, - ) - - def forward(self, input_tensor: torch.Tensor) -> torch.FloatTensor: - embedding: torch.Tensor = self.model.encode_image( - input_tensor.half() if self.device.type == "cuda" else input_tensor, - normalize=True, - ).float() - return embedding - - -def export(model: ExportBase) -> None: - model.eval() - for param in model.parameters(): - param.requires_grad = False - dummy_input = model.dummy_input() - model(dummy_input) - jit = torch.jit.trace(model, dummy_input) # type: ignore[no-untyped-call,attr-defined] - tflite_model_path = f"output/{model.name}.tflite" - os.makedirs("output", exist_ok=True) - - converter = TFLiteConverter( - jit, - dummy_input, - tflite_model_path, - optimize=model.optimize, - nchw_transpose=model.nchw_transpose, - ) - # segfaults on ARM, must run on x86_64 / AMD64 - converter.convert() - - armnn_model_path = f"output/{model.name}.armnn" - os.environ["LD_LIBRARY_PATH"] = "armnn" - subprocess.run( - [ - "./armnnconverter", - "-f", - "tflite-binary", - "-m", - tflite_model_path, - "-i", - "input_tensor", - "-o", - "output_tensor", - "-p", - armnn_model_path, - ] - ) - - -def main() -> None: - if platform.machine() not in ("x86_64", "AMD64"): - raise RuntimeError(f"Can only run on x86_64 / AMD64, not {platform.machine()}") - - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - if device.type != "cuda": - logging.warning( - "No CUDA available, cannot create fp16 model! 
proceeding to create a fp32 model (use only for testing)" - ) - models = [ - ClipVision("ViT-B-32", "openai", device), - ArcFace("buffalo_l_rec.onnx", device), - RetinaFace("buffalo_l_det.onnx", device), - ] - for model in models: - export(model) - - -if __name__ == "__main__": - with torch.no_grad(): - main() diff --git a/machine-learning/ann/export/.gitignore b/machine-learning/export/ann/.gitignore similarity index 100% rename from machine-learning/ann/export/.gitignore rename to machine-learning/export/ann/.gitignore diff --git a/machine-learning/export/ann/Dockerfile b/machine-learning/export/ann/Dockerfile new file mode 100644 index 0000000000..b6dcdc99a5 --- /dev/null +++ b/machine-learning/export/ann/Dockerfile @@ -0,0 +1,28 @@ +FROM mambaorg/micromamba:bookworm-slim@sha256:333f7598ff2c2400fb10bfe057709c68b7daab5d847143af85abcf224a07271a as builder + +ENV TRANSFORMERS_CACHE=/cache \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PATH="/opt/venv/bin:$PATH" + +WORKDIR /export/ann + +USER root +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + curl \ + git + +USER $MAMBA_USER +COPY --chown=$MAMBA_USER:$MAMBA_USER env.yaml ./ +RUN micromamba install -y -f env.yaml +COPY --chown=$MAMBA_USER:$MAMBA_USER *.sh *.cpp ./ + +ENV ARMNN_PATH=/export/ann/armnn +RUN ./download-armnn.sh && \ + ./build-converter.sh && \ + ./build.sh +COPY --chown=$MAMBA_USER:$MAMBA_USER run.py ./ + +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] +CMD ["python", "run.py"] diff --git a/machine-learning/export/ann/ann.cpp b/machine-learning/export/ann/ann.cpp new file mode 100644 index 0000000000..d0010f690b --- /dev/null +++ b/machine-learning/export/ann/ann.cpp @@ -0,0 +1,281 @@ +#include +#include +#include + +#include "armnn/IRuntime.hpp" +#include "armnn/INetwork.hpp" +#include "armnn/Types.hpp" +#include "armnnDeserializer/IDeserializer.hpp" +#include "armnnTfLiteParser/ITfLiteParser.hpp" +#include "armnnOnnxParser/IOnnxParser.hpp" + +using namespace armnn; + +struct IOInfos +{ + std::vector inputInfos; + std::vector outputInfos; +}; + +// from https://rigtorp.se/spinlock/ +struct SpinLock +{ + std::atomic lock_ = {false}; + + void lock() + { + for (;;) + { + if (!lock_.exchange(true, std::memory_order_acquire)) + { + break; + } + while (lock_.load(std::memory_order_relaxed)) + ; + } + } + + void unlock() { lock_.store(false, std::memory_order_release); } +}; + +class Ann +{ + +public: + int load(const char *modelPath, + bool fastMath, + bool fp16, + bool saveCachedNetwork, + const char *cachedNetworkPath) + { + INetworkPtr network = loadModel(modelPath); + IOptimizedNetworkPtr optNet = OptimizeNetwork(network.get(), fastMath, fp16, saveCachedNetwork, cachedNetworkPath); + const IOInfos infos = getIOInfos(optNet.get()); + NetworkId netId; + mutex.lock(); + Status status = runtime->LoadNetwork(netId, std::move(optNet)); + mutex.unlock(); + if (status != Status::Success) + { + return -1; + } + spinLock.lock(); + ioInfos[netId] = infos; + mutexes.emplace(netId, std::make_unique()); + spinLock.unlock(); + return netId; + } + + void execute(NetworkId netId, const void **inputData, void **outputData) + { + spinLock.lock(); + const IOInfos *infos = &ioInfos[netId]; + auto m = mutexes[netId].get(); + spinLock.unlock(); + InputTensors inputTensors; + inputTensors.reserve(infos->inputInfos.size()); + size_t i = 0; + for (const BindingPointInfo &info : infos->inputInfos) + inputTensors.emplace_back(info.first, ConstTensor(info.second, inputData[i++])); + OutputTensors 
outputTensors; + outputTensors.reserve(infos->outputInfos.size()); + i = 0; + for (const BindingPointInfo &info : infos->outputInfos) + outputTensors.emplace_back(info.first, Tensor(info.second, outputData[i++])); + m->lock(); + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + m->unlock(); + } + + void unload(NetworkId netId) + { + mutex.lock(); + runtime->UnloadNetwork(netId); + mutex.unlock(); + } + + int tensors(NetworkId netId, bool isInput = false) + { + spinLock.lock(); + const IOInfos *infos = &ioInfos[netId]; + spinLock.unlock(); + return (int)(isInput ? infos->inputInfos.size() : infos->outputInfos.size()); + } + + unsigned long shape(NetworkId netId, bool isInput = false, int index = 0) + { + spinLock.lock(); + const IOInfos *infos = &ioInfos[netId]; + spinLock.unlock(); + const TensorShape shape = (isInput ? infos->inputInfos : infos->outputInfos)[index].second.GetShape(); + unsigned long s = 0; + for (unsigned int d = 0; d < shape.GetNumDimensions(); d++) + s |= ((unsigned long)shape[d]) << (d * 16); // stores up to 4 16-bit values in a 64-bit value + return s; + } + + Ann(int tuningLevel, const char *tuningFile) + { + IRuntime::CreationOptions runtimeOptions; + BackendOptions backendOptions{"GpuAcc", + { + {"TuningLevel", tuningLevel}, + {"MemoryOptimizerStrategy", "ConstantMemoryStrategy"}, // SingleAxisPriorityList or ConstantMemoryStrategy + }}; + if (tuningFile) + backendOptions.AddOption({"TuningFile", tuningFile}); + runtimeOptions.m_BackendOptions.emplace_back(backendOptions); + runtime = IRuntime::CreateRaw(runtimeOptions); + }; + ~Ann() + { + IRuntime::Destroy(runtime); + }; + +private: + INetworkPtr loadModel(const char *modelPath) + { + const auto path = std::string(modelPath); + if (path.rfind(".tflite") == path.length() - 7) // endsWith() + { + auto parser = armnnTfLiteParser::ITfLiteParser::CreateRaw(); + return parser->CreateNetworkFromBinaryFile(modelPath); + } + else if (path.rfind(".onnx") == path.length() - 5) // endsWith() + { + auto parser = armnnOnnxParser::IOnnxParser::CreateRaw(); + return parser->CreateNetworkFromBinaryFile(modelPath); + } + else + { + std::ifstream ifs(path, std::ifstream::in | std::ifstream::binary); + auto parser = armnnDeserializer::IDeserializer::CreateRaw(); + return parser->CreateNetworkFromBinary(ifs); + } + } + + static BindingPointInfo getInputTensorInfo(LayerBindingId inputBindingId, TensorInfo info) + { + const auto newInfo = TensorInfo{info.GetShape(), info.GetDataType(), + info.GetQuantizationScale(), + info.GetQuantizationOffset(), + true}; + return {inputBindingId, newInfo}; + } + + IOptimizedNetworkPtr OptimizeNetwork(INetwork *network, bool fastMath, bool fp16, bool saveCachedNetwork, const char *cachedNetworkPath) + { + const bool allowExpandedDims = false; + const ShapeInferenceMethod shapeInferenceMethod = ShapeInferenceMethod::ValidateOnly; + + OptimizerOptionsOpaque options; + options.SetReduceFp32ToFp16(fp16); + options.SetShapeInferenceMethod(shapeInferenceMethod); + options.SetAllowExpandedDims(allowExpandedDims); + + BackendOptions gpuAcc("GpuAcc", {{"FastMathEnabled", fastMath}}); + if (cachedNetworkPath) + { + gpuAcc.AddOption({"SaveCachedNetwork", saveCachedNetwork}); + gpuAcc.AddOption({"CachedNetworkFilePath", cachedNetworkPath}); + } + options.AddModelOption(gpuAcc); + + // No point in using ARMNN for CPU, use ONNX (quantized) instead. 
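(Aside: the 64-bit value returned by Ann::shape above packs one tensor dimension per 16 bits, dimension 0 in the lowest bits. A hypothetical caller-side helper — not part of this patch, and assuming no zero-sized dimensions — could unpack it like this:)

    def unpack_shape(packed: int, max_dims: int = 4) -> tuple[int, ...]:
        # one 16-bit field per dimension, least-significant field first
        dims = [(packed >> (16 * d)) & 0xFFFF for d in range(max_dims)]
        return tuple(d for d in dims if d != 0)  # trailing zero fields are unused dimensions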
+ // BackendOptions cpuAcc("CpuAcc", + // { + // {"FastMathEnabled", fastMath}, + // {"NumberOfThreads", 0}, + // }); + // options.AddModelOption(cpuAcc); + + BackendOptions allowExDimOpt("AllowExpandedDims", + {{"AllowExpandedDims", allowExpandedDims}}); + options.AddModelOption(allowExDimOpt); + BackendOptions shapeInferOpt("ShapeInferenceMethod", + {{"InferAndValidate", shapeInferenceMethod == ShapeInferenceMethod::InferAndValidate}}); + options.AddModelOption(shapeInferOpt); + + std::vector backends = { + BackendId("GpuAcc"), + // BackendId("CpuAcc"), + // BackendId("CpuRef"), + }; + return Optimize(*network, backends, runtime->GetDeviceSpec(), options); + } + + IOInfos getIOInfos(IOptimizedNetwork *optNet) + { + struct InfoStrategy : IStrategy + { + void ExecuteStrategy(const IConnectableLayer *layer, + const BaseDescriptor &descriptor, + const std::vector &constants, + const char *name, + const LayerBindingId id = 0) override + { + IgnoreUnused(descriptor, constants, id); + const LayerType lt = layer->GetType(); + if (lt == LayerType::Input) + ioInfos.inputInfos.push_back(getInputTensorInfo(id, layer->GetOutputSlot(0).GetTensorInfo())); + else if (lt == LayerType::Output) + ioInfos.outputInfos.push_back({id, layer->GetInputSlot(0).GetTensorInfo()}); + } + IOInfos ioInfos; + }; + + InfoStrategy infoStrategy; + optNet->ExecuteStrategy(infoStrategy); + return infoStrategy.ioInfos; + } + + IRuntime *runtime; + std::map ioInfos; + std::map> mutexes; // mutex per network to not execute the same the same network concurrently + std::mutex mutex; // global mutex for load/unload calls to the runtime + SpinLock spinLock; // fast spin lock to guard access to the ioInfos and mutexes maps +}; + +extern "C" void *init(int logLevel, int tuningLevel, const char *tuningFile) +{ + LogSeverity level = static_cast(logLevel); + ConfigureLogging(true, true, level); + + Ann *ann = new Ann(tuningLevel, tuningFile); + return ann; +} + +extern "C" void destroy(void *ann) +{ + delete ((Ann *)ann); +} + +extern "C" int load(void *ann, + const char *path, + bool fastMath, + bool fp16, + bool saveCachedNetwork, + const char *cachedNetworkPath) +{ + return ((Ann *)ann)->load(path, fastMath, fp16, saveCachedNetwork, cachedNetworkPath); +} + +extern "C" void unload(void *ann, NetworkId netId) +{ + ((Ann *)ann)->unload(netId); +} + +extern "C" void execute(void *ann, NetworkId netId, const void **inputData, void **outputData) +{ + ((Ann *)ann)->execute(netId, inputData, outputData); +} + +extern "C" unsigned long shape(void *ann, NetworkId netId, bool isInput, int index) +{ + return ((Ann *)ann)->shape(netId, isInput, index); +} + +extern "C" int tensors(void *ann, NetworkId netId, bool isInput) +{ + return ((Ann *)ann)->tensors(netId, isInput); +} \ No newline at end of file diff --git a/machine-learning/export/ann/build-converter.sh b/machine-learning/export/ann/build-converter.sh new file mode 100755 index 0000000000..9df3e23873 --- /dev/null +++ b/machine-learning/export/ann/build-converter.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env sh + +cd armnn-23.11/ || exit +g++ -o ../armnnconverter -fPIC -O1 -DARMNN_ONNX_PARSER -DARMNN_SERIALIZER -DARMNN_TF_LITE_PARSER -fuse-ld=gold -std=c++17 -Iinclude -Isrc/armnnUtils -Ithird-party -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -larmnnSerializer -L../armnn src/armnnConverter/ArmnnConverter.cpp diff --git a/machine-learning/export/ann/build.sh b/machine-learning/export/ann/build.sh new file mode 100755 index 0000000000..95bae3c274 --- /dev/null +++ 
b/machine-learning/export/ann/build.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env sh + +g++ -shared -O3 -fPIC -o libann.so -fuse-ld=gold -std=c++17 -I"$ARMNN_PATH"/include -larmnn -larmnnDeserializer -larmnnTfLiteParser -larmnnOnnxParser -L"$ARMNN_PATH" ann.cpp diff --git a/machine-learning/ann/export/download-armnn.sh b/machine-learning/export/ann/download-armnn.sh similarity index 100% rename from machine-learning/ann/export/download-armnn.sh rename to machine-learning/export/ann/download-armnn.sh diff --git a/machine-learning/ann/export/env.yaml b/machine-learning/export/ann/env.yaml similarity index 100% rename from machine-learning/ann/export/env.yaml rename to machine-learning/export/ann/env.yaml diff --git a/machine-learning/export/ann/run.py b/machine-learning/export/ann/run.py new file mode 100644 index 0000000000..f3f9610a99 --- /dev/null +++ b/machine-learning/export/ann/run.py @@ -0,0 +1,297 @@ +import os +import platform +import subprocess +from tempfile import TemporaryDirectory +from typing import Callable, ClassVar + +import onnx +import torch +from onnx2torch import convert +from onnx2torch.node_converters.registry import add_converter +from onnxruntime.tools.onnx_model_utils import fix_output_shapes, make_input_shape_fixed +from tinynn.converter import TFLiteConverter +from huggingface_hub import snapshot_download +from onnx2torch.onnx_graph import OnnxGraph +from onnx2torch.onnx_node import OnnxNode +from onnx2torch.utils.common import OperationConverterResult, onnx_mapping_from_node +from onnx.shape_inference import infer_shapes_path +from huggingface_hub import login, upload_file + +# egregious hacks: +# changed `Clip`'s min/max logic to skip empty strings +# changed OnnxSqueezeDynamicAxes to use `sorted` instead of `torch.sort`` +# commented out shape inference in `fix_output_shapes`` + + +class ArgMax(torch.nn.Module): + def __init__(self, dim: int = -1, keepdim: bool = False): + super().__init__() + self.dim = dim + self.keepdim = keepdim + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.argmax(input, dim=self.dim, keepdim=self.keepdim) + + +class Erf(torch.nn.Module): + def forward(self, input: torch.Tensor) -> torch.Tensor: + return torch.erf(input) + + +@add_converter(operation_type="ArgMax", version=13) +def _(node: OnnxNode, graph: OnnxGraph) -> OperationConverterResult: + return OperationConverterResult( + torch_module=ArgMax(), + onnx_mapping=onnx_mapping_from_node(node=node), + ) + + +class ExportBase(torch.nn.Module): + task: ClassVar[str] + + def __init__( + self, + name: str, + input_shape: tuple[int, ...], + pretrained: str | None = None, + optimization_level: int = 5, + ): + super().__init__() + self.name = name + self.optimize = optimization_level + self.nchw_transpose = False + self.input_shape = input_shape + self.pretrained = pretrained + self.dummy_param = torch.nn.Parameter(torch.empty(0)) + self.model = self.load().eval() + for param in self.parameters(): + param.requires_grad_(False) + self.eval() + + def load(self) -> torch.nn.Module: + cache_dir = os.path.join(os.environ["CACHE_DIR"], self.model_name) + task_path = os.path.join(cache_dir, self.task) + model_path = os.path.join(task_path, "model.onnx") + if not os.path.isfile(model_path): + snapshot_download(self.repo_name, cache_dir=cache_dir, local_dir=cache_dir) + infer_shapes_path(model_path, check_type=True, strict_mode=True, data_prop=True) + onnx_model = onnx.load_model(model_path) + make_input_shape_fixed(onnx_model.graph, onnx_model.graph.input[0].name, self.input_shape) 
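+        # pin the first graph input to self.input_shape (the upstream ONNX graphs typically ship with
+        # dynamic dims) and let fix_output_shapes below update the output shapes to match; the
+        # TFLite -> ARM NN conversion downstream expects fully static shapes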
+ fix_output_shapes(onnx_model) + # try: + # onnx.save(onnx_model, model_path) + # except: + # onnx.save(onnx_model, model_path, save_as_external_data=True, all_tensors_to_one_file=False) + # infer_shapes_path(model_path, check_type=True, strict_mode=True, data_prop=True) + # onnx_model = onnx.load_model(model_path) + # onnx_model = infer_shapes(onnx_model, check_type=True, strict_mode=True, data_prop=True) + return convert(onnx_model) + + def forward(self, *inputs: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor]: + if self.precision == "fp16": + inputs = tuple(i.half() for i in inputs) + + out = self._forward(*inputs) + if self.precision == "fp16": + if isinstance(out, tuple): + return tuple(o.float() for o in out) + return out.float() + return out + + def _forward(self, *inputs: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor]: + return self.model(*inputs) + + def to_armnn(self, output_path: str) -> None: + output_dir = os.path.dirname(output_path) + os.makedirs(output_dir, exist_ok=True) + self(*self.dummy_inputs) + print(f"Exporting {self.model_name} ({self.task}) with {self.precision} precision") + jit = torch.jit.trace(self, self.dummy_inputs).eval() + with TemporaryDirectory() as tmpdir: + tflite_model_path = os.path.join(tmpdir, "model.tflite") + converter = TFLiteConverter( + jit, + self.dummy_inputs, + tflite_model_path, + optimize=self.optimize, + nchw_transpose=self.nchw_transpose, + ) + # segfaults on ARM, must run on x86_64 / AMD64 + converter.convert() + + subprocess.run( + [ + "./armnnconverter", + "-f", + "tflite-binary", + "-m", + tflite_model_path, + "-i", + "input_tensor", + "-o", + "output_tensor", + "-p", + output_path, + ], + capture_output=True, + ) + print(f"Finished exporting {self.name} ({self.task}) with {self.precision} precision") + + @property + def dummy_inputs(self) -> tuple[torch.FloatTensor]: + return (torch.rand(self.input_shape, device=self.device, dtype=self.dtype),) + + @property + def model_name(self) -> str: + return f"{self.name}__{self.pretrained}" if self.pretrained else self.name + + @property + def repo_name(self) -> str: + return f"immich-app/{self.model_name}" + + @property + def device(self) -> torch.device: + return self.dummy_param.device + + @property + def dtype(self) -> torch.dtype: + return self.dummy_param.dtype + + @property + def precision(self) -> str: + match self.dtype: + case torch.float32: + return "fp32" + case torch.float16: + return "fp16" + case _: + raise ValueError(f"Unsupported dtype {self.dtype}") + + +class ArcFace(ExportBase): + task = "recognition" + + +class RetinaFace(ExportBase): + task = "detection" + + +class OpenClipVisual(ExportBase): + task = "visual" + + +class OpenClipTextual(ExportBase): + task = "textual" + + @property + def dummy_inputs(self) -> tuple[torch.LongTensor]: + return (torch.randint(0, 5000, self.input_shape, device=self.device, dtype=torch.int32),) + + +class MClipTextual(ExportBase): + task = "textual" + + @property + def dummy_inputs(self) -> tuple[torch.LongTensor]: + return ( + torch.randint(0, 5000, self.input_shape, device=self.device, dtype=torch.int32), + torch.randint(0, 1, self.input_shape, device=self.device, dtype=torch.int32), + ) + + +def main() -> None: + if platform.machine() not in ("x86_64", "AMD64"): + raise RuntimeError(f"Can only run on x86_64 / AMD64, not {platform.machine()}") + login(token=os.environ["HF_AUTH_TOKEN"]) + os.environ["LD_LIBRARY_PATH"] = "armnn" + device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + failed: 
list[Callable[[], ExportBase]] = [ + lambda: OpenClipVisual("ViT-H-14-378-quickgelu", (1, 3, 378, 378), pretrained="dfn5b"), # flatbuffers: cannot grow buffer beyond 2 gigabytes (will probably work with fp16) + lambda: OpenClipVisual("ViT-H-14-quickgelu", (1, 3, 224, 224), pretrained="dfn5b"), # flatbuffers: cannot grow buffer beyond 2 gigabytes (will probably work with fp16) + lambda: OpenClipTextual("nllb-clip-base-siglip", (1, 77), pretrained="v1"), # ERROR (tinynn.converter.base) Unsupported ops: aten::logical_not + lambda: OpenClipTextual("nllb-clip-large-siglip", (1, 77), pretrained="v1"), # ERROR (tinynn.converter.base) Unsupported ops: aten::logical_not + lambda: OpenClipVisual("ViT-B-32", (1, 3, 224, 224), pretrained="laion2b_e16"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-32", (1, 77), pretrained="laion2b_e16"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-B-32", (1, 3, 224, 224), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-32", (1, 77), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-B-32", (1, 3, 224, 224), pretrained="laion400m_e32"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-32", (1, 77), pretrained="laion400m_e32"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-B-32", (1, 3, 224, 224), pretrained="laion2b-s34b-b79k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-32", (1, 77), pretrained="laion2b-s34b-b79k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-B-16", (1, 3, 224, 224), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-16", (1, 77), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-B-16", (1, 3, 224, 224), pretrained="laion400m_e32"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-16", (1, 77), pretrained="laion400m_e32"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-B-16-plus-240", (1, 3, 224, 224), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-B-16-plus-240", (1, 77), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-L-14", (1, 3, 224, 224), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-L-14", (1, 77), pretrained="laion400m_e31"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-L-14", (1, 3, 224, 224), pretrained="laion400m_e32"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-L-14", (1, 77), pretrained="laion400m_e32"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-L-14", (1, 3, 224, 224), pretrained="laion2b-s32b-b82k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-L-14", (1, 77), pretrained="laion2b-s32b-b82k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-H-14", (1, 3, 224, 224), 
pretrained="laion2b-s32b-b79k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-H-14", (1, 77), pretrained="laion2b-s32b-b79k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("ViT-g-14", (1, 3, 224, 224), pretrained="laion2b-s12b-b42k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipTextual("ViT-g-14", (1, 77), pretrained="laion2b-s12b-b42k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("XLM-Roberta-Large-Vit-B-16Plus", (1, 3, 240, 240)), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("XLM-Roberta-Large-ViT-H-14", (1, 3, 224, 224), pretrained="frozen_laion5b_s13b_b90k"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("nllb-clip-base-siglip", (1, 3, 384, 384), pretrained="v1"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("nllb-clip-large-siglip", (1, 3, 384, 384), pretrained="v1"), # ERROR (tinynn.converter.base) Unsupported ops: aten::erf + lambda: OpenClipVisual("RN50", (1, 3, 224, 224), pretrained="yfcc15m"), # BatchNorm operation with mean/var output is not implemented + lambda: OpenClipTextual("RN50", (1, 77), pretrained="yfcc15m"), # BatchNorm operation with mean/var output is not implemented + lambda: OpenClipVisual("RN50", (1, 3, 224, 224), pretrained="cc12m"), # BatchNorm operation with mean/var output is not implemented + lambda: OpenClipTextual("RN50", (1, 77), pretrained="cc12m"), # BatchNorm operation with mean/var output is not implemented + lambda: MClipTextual("XLM-Roberta-Large-Vit-L-14", (1, 77)), # Expected normalized_shape to be at least 1-dimensional, i.e., containing at least one element, but got normalized_shape = [] + lambda: MClipTextual("XLM-Roberta-Large-Vit-B-16Plus", (1, 77)), # Expected normalized_shape to be at least 1-dimensional, i.e., containing at least one element, but got normalized_shape = [] + lambda: MClipTextual("LABSE-Vit-L-14", (1, 77)), # Expected normalized_shape to be at least 1-dimensional, i.e., containing at least one element, but got normalized_shape = [] + lambda: OpenClipTextual("XLM-Roberta-Large-ViT-H-14", (1, 77), pretrained="frozen_laion5b_s13b_b90k"), # Expected normalized_shape to be at least 1-dimensional, i.e., containing at least one element, but got normalized_shape = [] + ] + + succeeded: list[Callable[[], ExportBase]] = [ + lambda: OpenClipVisual("ViT-B-32", (1, 3, 224, 224), pretrained="openai"), + lambda: OpenClipTextual("ViT-B-32", (1, 77), pretrained="openai"), + lambda: OpenClipVisual("ViT-B-16", (1, 3, 224, 224), pretrained="openai"), + lambda: OpenClipTextual("ViT-B-16", (1, 77), pretrained="openai"), + lambda: OpenClipVisual("ViT-L-14", (1, 3, 224, 224), pretrained="openai"), + lambda: OpenClipTextual("ViT-L-14", (1, 77), pretrained="openai"), + lambda: OpenClipVisual("ViT-L-14-336", (1, 3, 336, 336), pretrained="openai"), + lambda: OpenClipTextual("ViT-L-14-336", (1, 77), pretrained="openai"), + lambda: OpenClipVisual("RN50", (1, 3, 224, 224), pretrained="openai"), + lambda: OpenClipTextual("RN50", (1, 77), pretrained="openai"), + lambda: OpenClipTextual("ViT-H-14-quickgelu", (1, 77), pretrained="dfn5b"), + lambda: OpenClipTextual("ViT-H-14-378-quickgelu", (1, 77), pretrained="dfn5b"), + lambda: OpenClipVisual("XLM-Roberta-Large-Vit-L-14", (1, 3, 224, 224)), + lambda: OpenClipVisual("XLM-Roberta-Large-Vit-B-32", (1, 3, 224, 224)), + lambda: 
ArcFace("buffalo_s", (1, 3, 112, 112), optimization_level=3), + lambda: RetinaFace("buffalo_s", (1, 3, 640, 640), optimization_level=3), + lambda: ArcFace("buffalo_m", (1, 3, 112, 112), optimization_level=3), + lambda: RetinaFace("buffalo_m", (1, 3, 640, 640), optimization_level=3), + lambda: ArcFace("buffalo_l", (1, 3, 112, 112), optimization_level=3), + lambda: RetinaFace("buffalo_l", (1, 3, 640, 640), optimization_level=3), + lambda: ArcFace("antelopev2", (1, 3, 112, 112), optimization_level=3), + lambda: RetinaFace("antelopev2", (1, 3, 640, 640), optimization_level=3), + ] + + models: list[Callable[[], ExportBase]] = [*failed, *succeeded] + for _model in succeeded: + model = _model().to(device) + try: + relative_path = os.path.join(model.task, "model.armnn") + output_path = os.path.join("output", model.model_name, relative_path) + model.to_armnn(output_path) + upload_file(path_or_fileobj=output_path, path_in_repo=relative_path, repo_id=model.repo_name) + if device == torch.device("cuda"): + model.half() + relative_path = os.path.join(model.task, "fp16", "model.armnn") + output_path = os.path.join("output", model.model_name, relative_path) + model.to_armnn(output_path) + upload_file(path_or_fileobj=output_path, path_in_repo=relative_path, repo_id=model.repo_name) + + except Exception as exc: + print(f"Failed to export {model.model_name} ({model.task}): {exc}") + + +if __name__ == "__main__": + with torch.no_grad(): + main() diff --git a/machine-learning/export/Dockerfile b/machine-learning/export/ort/Dockerfile similarity index 100% rename from machine-learning/export/Dockerfile rename to machine-learning/export/ort/Dockerfile diff --git a/machine-learning/export/conda-lock.yml b/machine-learning/export/ort/conda-lock.yml similarity index 100% rename from machine-learning/export/conda-lock.yml rename to machine-learning/export/ort/conda-lock.yml diff --git a/machine-learning/export/env.dev.yaml b/machine-learning/export/ort/env.dev.yaml similarity index 100% rename from machine-learning/export/env.dev.yaml rename to machine-learning/export/ort/env.dev.yaml diff --git a/machine-learning/export/env.yaml b/machine-learning/export/ort/env.yaml similarity index 100% rename from machine-learning/export/env.yaml rename to machine-learning/export/ort/env.yaml diff --git a/machine-learning/export/models/__init__.py b/machine-learning/export/ort/models/__init__.py similarity index 100% rename from machine-learning/export/models/__init__.py rename to machine-learning/export/ort/models/__init__.py diff --git a/machine-learning/export/models/mclip.py b/machine-learning/export/ort/models/mclip.py similarity index 73% rename from machine-learning/export/models/mclip.py rename to machine-learning/export/ort/models/mclip.py index 565539016a..97d1c0c5f3 100644 --- a/machine-learning/export/models/mclip.py +++ b/machine-learning/export/ort/models/mclip.py @@ -19,37 +19,44 @@ _MCLIP_TO_OPENCLIP = { } +def forward(self: MultilingualCLIP, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: + embs = self.transformer(input_ids, attention_mask)[0] + embs = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] + embs = self.LinearTransformation(embs) + return torch.nn.functional.normalize(embs, dim=-1) + +# unfortunately need to monkeypatch for tracing to work here +# otherwise it hits the 2GiB protobuf serialization limit +MultilingualCLIP.forward = forward + + +def to_torchscript(model_name: str) -> torch.jit.ScriptModule: + with tempfile.TemporaryDirectory() 
as tmpdir: + model = MultilingualCLIP.from_pretrained(model_name, cache_dir=tmpdir) + + model.eval() + for param in model.parameters(): + param.requires_grad_(False) + + return model + + def to_onnx( model_name: str, output_dir_visual: Path | str, output_dir_textual: Path | str, ) -> None: textual_path = get_model_path(output_dir_textual) - with tempfile.TemporaryDirectory() as tmpdir: - model = MultilingualCLIP.from_pretrained(model_name, cache_dir=tmpdir) - AutoTokenizer.from_pretrained(model_name).save_pretrained(output_dir_textual) + model = to_torchscript(model_name) + AutoTokenizer.from_pretrained(model_name).save_pretrained(output_dir_textual) - for param in model.parameters(): - param.requires_grad_(False) - - export_text_encoder(model, textual_path) - openclip_to_onnx(_MCLIP_TO_OPENCLIP[model_name], output_dir_visual) - optimize(textual_path) + _text_encoder_to_onnx(model, textual_path) + openclip_to_onnx(_MCLIP_TO_OPENCLIP[model_name], output_dir_visual) + optimize(textual_path) -def export_text_encoder(model: MultilingualCLIP, output_path: Path | str) -> None: +def _text_encoder_to_onnx(model: MultilingualCLIP, output_path: Path | str) -> None: output_path = Path(output_path) - - def forward(self: MultilingualCLIP, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor: - embs = self.transformer(input_ids, attention_mask)[0] - embs = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None] - embs = self.LinearTransformation(embs) - return torch.nn.functional.normalize(embs, dim=-1) - - # unfortunately need to monkeypatch for tracing to work here - # otherwise it hits the 2GiB protobuf serialization limit - MultilingualCLIP.forward = forward - args = (torch.ones(1, 77, dtype=torch.int32), torch.ones(1, 77, dtype=torch.int32)) with warnings.catch_warnings(): warnings.simplefilter("ignore", UserWarning) diff --git a/machine-learning/export/models/openclip.py b/machine-learning/export/ort/models/openclip.py similarity index 83% rename from machine-learning/export/models/openclip.py rename to machine-learning/export/ort/models/openclip.py index d5d2b3ef5d..11337c5468 100644 --- a/machine-learning/export/models/openclip.py +++ b/machine-learning/export/ort/models/openclip.py @@ -26,6 +26,17 @@ class OpenCLIPModelConfig: self.sequence_length = open_clip_cfg["text_cfg"]["context_length"] +def to_torchscript(model_name: str) -> torch.jit.ScriptModule: + with tempfile.TemporaryDirectory() as tmpdir: + model = MultilingualCLIP.from_pretrained(model_name, cache_dir=tmpdir) + + model.eval() + for param in model.parameters(): + param.requires_grad_(False) + + return model + + def to_onnx( model_cfg: OpenCLIPModelConfig, output_dir_visual: Path | str | None = None, @@ -51,7 +62,7 @@ def to_onnx( save_config(open_clip.get_model_preprocess_cfg(model), output_dir_visual / "preprocess_cfg.json") save_config(text_vision_cfg, output_dir_visual.parent / "config.json") - export_image_encoder(model, model_cfg, visual_path) + _image_encoder_to_onnx(model, model_cfg, visual_path) optimize(visual_path) @@ -61,11 +72,11 @@ def to_onnx( tokenizer_name = text_vision_cfg["text_cfg"].get("hf_tokenizer_name", "openai/clip-vit-base-patch32") AutoTokenizer.from_pretrained(tokenizer_name).save_pretrained(output_dir_textual) - export_text_encoder(model, model_cfg, textual_path) + _text_encoder_to_onnx(model, model_cfg, textual_path) optimize(textual_path) -def export_image_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None: 
+def _image_encoder_to_onnx(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None: output_path = Path(output_path) def encode_image(image: torch.Tensor) -> torch.Tensor: @@ -89,7 +100,7 @@ def export_image_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, ) -def export_text_encoder(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None: +def _text_encoder_to_onnx(model: open_clip.CLIP, model_cfg: OpenCLIPModelConfig, output_path: Path | str) -> None: output_path = Path(output_path) def encode_text(text: torch.Tensor) -> torch.Tensor: diff --git a/machine-learning/export/models/optimize.py b/machine-learning/export/ort/models/optimize.py similarity index 100% rename from machine-learning/export/models/optimize.py rename to machine-learning/export/ort/models/optimize.py diff --git a/machine-learning/export/models/util.py b/machine-learning/export/ort/models/util.py similarity index 100% rename from machine-learning/export/models/util.py rename to machine-learning/export/ort/models/util.py diff --git a/machine-learning/export/run.py b/machine-learning/export/ort/run.py similarity index 100% rename from machine-learning/export/run.py rename to machine-learning/export/ort/run.py
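For reference, the extern "C" surface compiled into libann.so above (init / load / execute / shape / tensors / unload / destroy) is designed to be driven over a C FFI. Below is a minimal ctypes sketch of that calling convention — it is not a wrapper shipped in this diff; the model path, tensor shapes and output size are placeholders, and the argument lists simply mirror the declarations in ann.cpp:

    import ctypes
    import numpy as np

    lib = ctypes.CDLL("./libann.so")  # built by build.sh; the ARM NN libraries must be on LD_LIBRARY_PATH
    lib.init.restype = ctypes.c_void_p
    lib.init.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_char_p]
    lib.load.restype = ctypes.c_int
    lib.load.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_bool, ctypes.c_bool, ctypes.c_bool, ctypes.c_char_p]
    lib.execute.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.POINTER(ctypes.c_void_p), ctypes.POINTER(ctypes.c_void_p)]
    lib.shape.restype = ctypes.c_ulong
    lib.shape.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_bool, ctypes.c_int]
    lib.unload.argtypes = [ctypes.c_void_p, ctypes.c_int]
    lib.destroy.argtypes = [ctypes.c_void_p]

    ann = lib.init(3, 2, None)  # placeholder logLevel and tuningLevel, no tuning file
    net_id = lib.load(ann, b"model.armnn", True, False, False, None)  # fastMath on, fp16 off, no cached network

    # placeholder buffers; real shapes come from shape()/tensors() and the model being loaded
    inp = np.zeros((1, 3, 224, 224), dtype=np.float32)
    out = np.zeros((1, 512), dtype=np.float32)
    inputs = (ctypes.c_void_p * 1)(inp.ctypes.data)
    outputs = (ctypes.c_void_p * 1)(out.ctypes.data)
    lib.execute(ann, net_id, inputs, outputs)

    lib.unload(ann, net_id)
    lib.destroy(ann)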