diff --git a/docs/docs/install/environment-variables.md b/docs/docs/install/environment-variables.md index e606d03dee..78a5289bf4 100644 --- a/docs/docs/install/environment-variables.md +++ b/docs/docs/install/environment-variables.md @@ -171,6 +171,7 @@ Redis (Sentinel) URL example JSON before encoding: | `MACHINE_LEARNING_MAX_BATCH_SIZE__FACIAL_RECOGNITION` | Set the maximum number of faces that will be processed at once by the facial recognition model | None (`1` if using OpenVINO) | machine learning | | `MACHINE_LEARNING_RKNN` | Enable RKNN hardware acceleration if supported | `True` | machine learning | | `MACHINE_LEARNING_RKNN_THREADS` | How many threads of RKNN runtime should be spinned up while inferencing. | `1` | machine learning | +| `MACHINE_LEARNING_MODEL_ARENA` | Pre-allocates CPU memory to avoid memory fragmentation | `True` (`False` in the CPU, CUDA, ARM NN, and RKNN Docker images) | machine learning | \*1: It is recommended to begin with this parameter when changing the concurrency levels of the machine learning service and then tune the other ones. 
diff --git a/machine-learning/Dockerfile b/machine-learning/Dockerfile index e4ed643375..f913fd3c78 100644 --- a/machine-learning/Dockerfile +++ b/machine-learning/Dockerfile @@ -70,7 +70,8 @@ RUN if [ "$DEVICE" = "rocm" ]; then \ FROM python:3.11-slim-bookworm@sha256:873f91540d53b36327ed4fb018c9669107a4e2a676719720edb4209c4b15d029 AS prod-cpu -ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 +ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 \ + MACHINE_LEARNING_MODEL_ARENA=false FROM python:3.11-slim-bookworm@sha256:873f91540d53b36327ed4fb018c9669107a4e2a676719720edb4209c4b15d029 AS prod-openvino @@ -88,7 +89,8 @@ RUN apt-get update && \ FROM nvidia/cuda:12.2.2-runtime-ubuntu22.04@sha256:94c1577b2cd9dd6c0312dc04dff9cb2fdce2b268018abc3d7c2dbcacf1155000 AS prod-cuda -ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 +ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 \ + MACHINE_LEARNING_MODEL_ARENA=false RUN apt-get update && \ apt-get install --no-install-recommends -yqq libcudnn9-cuda-12 && \ @@ -104,7 +106,8 @@ FROM rocm/dev-ubuntu-22.04:6.4.3-complete@sha256:1f7e92ca7e3a3785680473329ed1091 FROM prod-cpu AS prod-armnn ENV LD_LIBRARY_PATH=/opt/armnn \ - LD_PRELOAD=/usr/lib/libmimalloc.so.2 + LD_PRELOAD=/usr/lib/libmimalloc.so.2 \ + MACHINE_LEARNING_MODEL_ARENA=false RUN apt-get update && apt-get install -y --no-install-recommends ocl-icd-libopencl1 mesa-opencl-icd libgomp1 && \ rm -rf /var/lib/apt/lists/* && \ @@ -127,7 +130,8 @@ FROM prod-cpu AS prod-rknn # renovate: datasource=github-tags depName=airockchip/rknn-toolkit2 ARG RKNN_TOOLKIT_VERSION="v2.3.0" -ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 +ENV LD_PRELOAD=/usr/lib/libmimalloc.so.2 \ + MACHINE_LEARNING_MODEL_ARENA=false ADD --checksum=sha256:73993ed4b440460825f21611731564503cc1d5a0c123746477da6cd574f34885 "https://github.com/airockchip/rknn-toolkit2/raw/refs/tags/${RKNN_TOOLKIT_VERSION}/rknpu2/runtime/Linux/librknn_api/aarch64/librknnrt.so" /usr/lib/ diff --git a/machine-learning/immich_ml/config.py b/machine-learning/immich_ml/config.py 
index 939afbc98b..d9edd88a8c 100644 --- a/machine-learning/immich_ml/config.py +++ b/machine-learning/immich_ml/config.py @@ -61,6 +61,7 @@ class Settings(BaseSettings): request_threads: int = os.cpu_count() or 4 model_inter_op_threads: int = 0 model_intra_op_threads: int = 0 + model_arena: bool = True ann: bool = True ann_fp16_turbo: bool = False ann_tuning_level: int = 2 diff --git a/machine-learning/immich_ml/models/constants.py b/machine-learning/immich_ml/models/constants.py index 41b0990f71..b15b75b7d1 100644 --- a/machine-learning/immich_ml/models/constants.py +++ b/machine-learning/immich_ml/models/constants.py @@ -79,6 +79,7 @@ SUPPORTED_PROVIDERS = [ "CUDAExecutionProvider", "ROCMExecutionProvider", "OpenVINOExecutionProvider", + "CoreMLExecutionProvider", "CPUExecutionProvider", ] diff --git a/machine-learning/immich_ml/sessions/ort.py b/machine-learning/immich_ml/sessions/ort.py index e7d8635876..d18aae751a 100644 --- a/machine-learning/immich_ml/sessions/ort.py +++ b/machine-learning/immich_ml/sessions/ort.py @@ -96,6 +96,14 @@ class OrtSession: "precision": "FP32", "cache_dir": (self.model_path.parent / "openvino").as_posix(), } + case "CoreMLExecutionProvider": + options = { + "ModelFormat": "MLProgram", + "MLComputeUnits": "ALL", + "SpecializationStrategy": "FastPrediction", + "AllowLowPrecisionAccumulationOnGPU": "1", + "ModelCacheDirectory": (self.model_path.parent / "coreml").as_posix(), + } case _: options = {} provider_options.append(options) @@ -115,7 +123,7 @@ class OrtSession: @property def _sess_options_default(self) -> ort.SessionOptions: sess_options = ort.SessionOptions() - sess_options.enable_cpu_mem_arena = False + sess_options.enable_cpu_mem_arena = settings.model_arena # avoid thread contention between models if settings.model_inter_op_threads > 0: diff --git a/machine-learning/test_main.py b/machine-learning/test_main.py index eeafd01062..582a05a950 100644 --- a/machine-learning/test_main.py +++ b/machine-learning/test_main.py @@ 
-180,6 +180,7 @@ class TestOrtSession: CUDA_EP_OUT_OF_ORDER = ["CPUExecutionProvider", "CUDAExecutionProvider"] TRT_EP = ["TensorrtExecutionProvider", "CUDAExecutionProvider", "CPUExecutionProvider"] ROCM_EP = ["ROCMExecutionProvider", "CPUExecutionProvider"] + COREML_EP = ["CoreMLExecutionProvider", "CPUExecutionProvider"] @pytest.mark.providers(CPU_EP) def test_sets_cpu_provider(self, providers: list[str]) -> None: @@ -225,6 +226,12 @@ class TestOrtSession: assert session.providers == self.ROCM_EP + @pytest.mark.providers(COREML_EP) + def test_uses_coreml(self, providers: list[str]) -> None: + session = OrtSession("ViT-B-32__openai") + + assert session.providers == self.COREML_EP + def test_sets_provider_kwarg(self) -> None: providers = ["CUDAExecutionProvider"] session = OrtSession("ViT-B-32__openai", providers=providers) @@ -284,7 +291,6 @@ class TestOrtSession: assert session.sess_options.execution_mode == ort.ExecutionMode.ORT_SEQUENTIAL assert session.sess_options.inter_op_num_threads == 1 assert session.sess_options.intra_op_num_threads == 2 - assert session.sess_options.enable_cpu_mem_arena is False def test_sets_default_sess_options_does_not_set_threads_if_non_cpu_and_default_threads(self) -> None: session = OrtSession("ViT-B-32__openai", providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) @@ -302,6 +308,26 @@ class TestOrtSession: assert session.sess_options.inter_op_num_threads == 2 assert session.sess_options.intra_op_num_threads == 4 + def test_uses_arena_if_enabled(self, mocker: MockerFixture) -> None: + mock_settings = mocker.patch("immich_ml.sessions.ort.settings", autospec=True) + mock_settings.model_inter_op_threads = 0 + mock_settings.model_intra_op_threads = 0 + mock_settings.model_arena = True + + session = OrtSession("ViT-B-32__openai", providers=["CPUExecutionProvider"]) + + assert session.sess_options.enable_cpu_mem_arena + + def test_does_not_use_arena_if_disabled(self, mocker: MockerFixture) -> None: + mock_settings = 
mocker.patch("immich_ml.sessions.ort.settings", autospec=True) + mock_settings.model_inter_op_threads = 0 + mock_settings.model_intra_op_threads = 0 + mock_settings.model_arena = False + + session = OrtSession("ViT-B-32__openai", providers=["CPUExecutionProvider"]) + + assert not session.sess_options.enable_cpu_mem_arena + def test_sets_sess_options_kwarg(self) -> None: sess_options = ort.SessionOptions() session = OrtSession(