# Copyright 2023 MOSEC Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Managers to control Coordinator and Mosec process."""
import multiprocessing as mp
import subprocess
from importlib.resources import files as importlib_files
from multiprocessing.context import ForkContext, SpawnContext
from multiprocessing.process import BaseProcess
from multiprocessing.synchronize import Event
from pathlib import Path
from time import monotonic, sleep
from typing import Callable, Dict, Iterable, List, Optional, Type, Union, cast
from mosec.coordinator import Coordinator
from mosec.env import env_var_context, validate_env, validate_int_ge
from mosec.log import get_internal_logger
from mosec.worker import Worker
logger = get_internal_logger()
NEW_PROCESS_METHOD = {"spawn", "fork"}
# pylint: disable=too-many-instance-attributes
# pylint: disable=too-many-arguments
# pylint: disable=too-few-public-methods
class Runtime:
"""The wrapper with one worker and its arguments."""
# count how many runtime instances have been created
_stage_id: int = 0
def __init__(
self,
worker: Type[Worker],
num: int = 1,
max_batch_size: int = 1,
max_wait_time: int = 10,
timeout: int = 3,
start_method: str = "spawn",
env: Union[None, List[Dict[str, str]]] = None,
):
"""Initialize the mosec coordinator.
Args:
worker (Worker): subclass of `mosec.Worker` implemented by users.
num (int): number of workers
max_batch_size: the maximum batch size allowed (>=1), will enable the
dynamic batching if it > 1
max_wait_time (int): the maximum wait time (millisecond)
for dynamic batching, needs to be used with `max_batch_size`
to enable the feature. If not configure, will use the CLI
argument `--wait` (default=10ms)
timeout (int): timeout (second) for the `forward` function.
start_method: the process starting method ("spawn" or "fork")
env: the environment variables to set before starting the process
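
        Example (an illustrative sketch; ``Preprocess`` below is a hypothetical
        user-defined `mosec.Worker` subclass, not part of this module)::

            from mosec import Worker

            class Preprocess(Worker):
                def forward(self, data):
                    # with max_batch_size > 1, ``data`` is a batched list of
                    # requests; echo it back unchanged
                    return data

            # one stage with 2 worker processes and dynamic batching up to 16
            runtime = Runtime(Preprocess, num=2, max_batch_size=16)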
"""
self.worker = worker
self.num = num
self.max_batch_size = max_batch_size
self.max_wait_time = max_wait_time
self.timeout = timeout
self.start_method = start_method
self.env = env
Runtime._stage_id += 1
# adding the stage id in case the worker class is added to multiple stages
self.name = f"{self.worker.__name__}_{self._stage_id}"
self._pool: List[Union[BaseProcess, None]] = [None for _ in range(self.num)]
self._validate()
@staticmethod
def _process_healthy(process: Union[BaseProcess, None]) -> bool:
"""Check if the child process is healthy.
ref: https://docs.python.org/3/library/multiprocessing.html
The exit code will be None if the process has not yet terminated.
"""
return process is not None and process.exitcode is None
def _healthy(self, method: Callable[[Iterable[object]], bool]) -> bool:
"""Check if all/any of the child processes are healthy."""
return method(self._pool)
def _start_process(
self,
worker_id: int,
work_path: str,
shutdown: Event,
shutdown_notify: Event,
):
"""Start a worker process in the context."""
context = mp.get_context(self.start_method)
context = cast(Union[SpawnContext, ForkContext], context)
coordinator_process = context.Process(
target=Coordinator,
args=(
self.worker,
self.max_batch_size,
shutdown,
shutdown_notify,
work_path,
self.name,
worker_id + 1,
self.timeout,
),
daemon=True,
)
with env_var_context(self.env, worker_id):
coordinator_process.start()
self._pool[worker_id] = coordinator_process
def _check(
self,
work_path: str,
shutdown: Event,
shutdown_notify: Event,
init: bool,
) -> bool:
"""Check and start the worker process if it has not started yet.
Args:
work_path: path of working directory
shutdown: Event of server shutdown
shutdown_notify: Event of server will shutdown
init: whether the worker is tried to start at the first time
Returns:
Whether the worker is started successfully
"""
# for every sequential stage
self._pool = [p if self._process_healthy(p) else None for p in self._pool]
if self._healthy(all):
# this stage is healthy
return True
if not init and not self._healthy(any):
# this stage might contain bugs
return False
need_start_id = [
worker_id for worker_id in range(self.num) if self._pool[worker_id] is None
]
for worker_id in need_start_id:
# for every worker in each stage
self._start_process(worker_id, work_path, shutdown, shutdown_notify)
return True
def _validate(self):
"""Validate arguments of worker runtime."""
validate_env(self.env, self.num)
assert issubclass(
self.worker, Worker
), "worker must be inherited from mosec.Worker"
validate_int_ge(self.num, "worker number")
validate_int_ge(self.max_batch_size, "maximum batch size")
validate_int_ge(self.max_wait_time, "maximum wait time", 0)
validate_int_ge(self.timeout, "forward timeout", 0)
assert (
self.start_method in NEW_PROCESS_METHOD
), f"start method must be one of {NEW_PROCESS_METHOD}"
class PyRuntimeManager:
"""The manager to control coordinator process."""
def __init__(self, work_path: str, shutdown: Event, shutdown_notify: Event):
"""Initialize a coordinator manager.
Args:
work_path: path of working directory
shutdown: Event of server shutdown
shutdown_notify: Event of server will shutdown
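
        Example (an illustrative sketch; this manager is normally created by
        the mosec server itself, and ``MyWorker`` is a hypothetical
        `mosec.Worker` subclass)::

            import multiprocessing as mp

            ctx = mp.get_context("spawn")
            manager = PyRuntimeManager(
                work_path="/tmp/mosec_workspace",  # made-up directory
                shutdown=ctx.Event(),
                shutdown_notify=ctx.Event(),
            )
            manager.append(Runtime(MyWorker))
            failed = manager.check_and_start(init=True)  # None means success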
"""
self.runtimes: List[Runtime] = []
self._work_path = work_path
self.shutdown = shutdown
self.shutdown_notify = shutdown_notify
@property
def worker_count(self) -> int:
"""Get number of workers."""
return len(self.runtimes)
@property
def workers(self) -> List[Type[Worker]]:
"""Get List of workers."""
return [r.worker for r in self.runtimes]
def append(self, runtime: Runtime):
"""Sequentially appends workers to the workflow pipeline."""
self.runtimes.append(runtime)
def check_and_start(self, init: bool) -> Union[Runtime, None]:
"""Check all worker processes and try to start failed ones.
Args:
init: whether the worker is tried to start at the first time
"""
for worker_runtime in self.runtimes:
if not worker_runtime._check( # pylint: disable=protected-access
self._work_path, self.shutdown, self.shutdown_notify, init
):
return worker_runtime
return None
class RsRuntimeManager:
"""The manager to control Mosec process."""
def __init__(self, timeout: int):
"""Initialize a Mosec manager.
Args:
timeout: service timeout in milliseconds
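
        Example (an illustrative sketch; the config path below is made up and
        is normally generated by the mosec server)::

            from pathlib import Path

            manager = RsRuntimeManager(timeout=3000)  # milliseconds
            manager.start(Path("/tmp/mosec_config.json"))
            ...  # serve traffic
            manager.halt()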
"""
self.process: Optional[subprocess.Popen] = None
self.server_path = importlib_files("mosec") / "bin" / "mosec"
self.timeout = timeout
def halt(self):
"""Graceful shutdown."""
# terminate controller first and wait for a graceful period
if self.process is None:
return
self.process.terminate()
graceful_period = monotonic() + self.timeout / 1000
while monotonic() < graceful_period:
ctr_exitcode = self.process.poll()
if ctr_exitcode is None:
sleep(0.1)
continue
# exited
if ctr_exitcode: # on error
logger.error("mosec service halted on error [%d]", ctr_exitcode)
else:
logger.info("mosec service halted normally [%d]", ctr_exitcode)
break
if monotonic() > graceful_period:
logger.error("failed to terminate mosec service, will try to kill it")
self.process.kill()
def start(self, config_path: Path) -> subprocess.Popen:
"""Start the Mosec process.
Args:
config_path: configuration path of mosec
"""
# pylint: disable=consider-using-with
self.process = subprocess.Popen([str(self.server_path), config_path])
return self.process