import io
import os
import shutil
import subprocess
import tempfile
import time
from multiprocessing import cpu_count
from pathlib import Path
from typing import List

from paramiko import SFTPClient

from hypernets.hyperctl import consts
from hypernets.hyperctl.batch import _ShellJob
from hypernets.hyperctl.utils import http_portal
from hypernets.utils import logging as hyn_logging
from hypernets.utils import ssh_utils
logger = hyn_logging.get_logger(__name__)
class NoResourceException(Exception):
pass
class ShellExecutor:
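    """Base class of shell-script executors: renders the run script of a
    `_ShellJob` from `consts.RUN_SH_TEMPLATE` and defines the run/status lifecycle.
    """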
def __init__(self, job: _ShellJob, api_server_portal, environments=None):
self.job = job
self.api_server_portal = api_server_portal
self.environments = environments
    def run(self, *, independent_tmp=True):
        pass

    def post(self):
        pass

    def prepare(self):
        pass

    def status(self):
        raise NotImplementedError
def _make_run_shell_content(self, independent_tmp):
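        """Render the run script from `consts.RUN_SH_TEMPLATE`: substitute the
        `#<KEY>#` job placeholders, then inject `export` statements for
        `self.environments` and, optionally, an independent TMP directory.
        """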
template_vars = {
consts.KEY_ENV_JOB_DATA_DIR: self.job.data_dir_path.as_posix(),
consts.KEY_ENV_JOB_NAME: self.job.name,
consts.KEY_ENV_JOB_WORKING_DIR: self.job.working_dir, # default value
consts.KEY_TEMPLATE_COMMAND: self.job.batch.job_command,
consts.KEY_ENV_SERVER_PORTAL: self.api_server_portal
}
run_shell = str(consts.RUN_SH_TEMPLATE)
for k, v in template_vars.items():
run_shell = run_shell.replace(f"#{k}#", v)
export_custom_envs_command = ""
if self.environments is not None:
for k, v in self.environments.items():
export_command = f"export {k}={v}\n"
export_custom_envs_command = export_custom_envs_command + export_command
export_tmp_command = ""
if independent_tmp:
independent_tmp_path = (Path(tempfile.gettempdir()) / consts.BATCH_TEMP / self.job.name).as_posix()
export_tmp_command = f"export TMP={independent_tmp_path}"
run_shell = run_shell.replace(f"#{consts.P_HOST_ENV}", export_custom_envs_command)\
.replace(f"#{consts.P_TMP_ENV}", export_tmp_command)
return run_shell
def _write_run_shell_script(self, file_path, independent_tmp):
file_path = Path(file_path)
        # ensure the parent directory exists
        file_path.parent.mkdir(parents=True, exist_ok=True)
# write file
shell_content = self._make_run_shell_content(independent_tmp)
logger.debug(f'write shell script for job {self.job.name} to {file_path} ')
with open(file_path, 'w', newline='\n') as f:
f.write(shell_content)
class LocalShellExecutor(ShellExecutor):
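    """Executor that runs the job's shell script on the local machine via `subprocess`."""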
def __init__(self, *args, **kwargs):
super(LocalShellExecutor, self).__init__(*args, **kwargs)
self._process = None
    def prepare_assets(self):
        if len(self.job.assets) == 0:
            return
        if not self.job.resources_path.exists():
            os.makedirs(self.job.resources_path, exist_ok=True)
        for asset in self.job.assets:
            asset_path = Path(asset).absolute()
            asset_file = asset_path.as_posix()
            if not asset_path.exists():
                logger.warning(f"local path {asset_path} does not exist, skip copying")
                continue
            if asset_path.is_dir():
                # copy the directory into resources_path under its own name;
                # dirs_exist_ok requires Python 3.8+
                shutil.copytree(asset_file, (self.job.resources_path / asset_path.name).as_posix(),
                                dirs_exist_ok=True)
            else:
                shutil.copyfile(asset_file, (self.job.resources_path / asset_path.name).as_posix())
    def run(self, *, independent_tmp=True):
# prepare data dir
job_data_dir = Path(self.job.data_dir_path)
if not job_data_dir.exists():
os.makedirs(job_data_dir, exist_ok=True)
self.prepare_assets()
# create shell file & write to local
self._write_run_shell_script(self.job.run_file, independent_tmp)
        command = f'sh {self.job.run_file}'
        # note: stdout is piped but never read here; very long output may fill the pipe buffer
        self._process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
        logger.debug(f'executed command "{command}", pid is {self._process.pid}')
    def status(self):
if self._process is None:
return _ShellJob.STATUS_INIT
process_ret_code = self._process.poll()
if process_ret_code is None:
return _ShellJob.STATUS_RUNNING
if process_ret_code == 0:
return _ShellJob.STATUS_SUCCEED
return _ShellJob.STATUS_FAILED
    def kill(self):
        if self._process is not None:
            try:
                self._process.kill()
                logger.info(f"kill pid {self._process.pid}")
            except Exception:
                logger.exception("kill job exception")
            # wait until the process has actually exited
            process_ret_code = self._process.poll()
            while process_ret_code is None:
                time.sleep(1)
                process_ret_code = self._process.poll()
            logger.info(f"pid exit with code {process_ret_code}")
        else:
            logger.warning("current executor is not running or already closed")
    def close(self):
        if self._process is not None:
            self._process.terminate()
        else:
            logger.warning("current executor is not running or already closed")
class SSHRemoteMachine:
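    """A remote unix machine reachable over SSH, with a naive resource ledger
    (`_usage`) used for all-or-nothing allocation.
    """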
def __init__(self, connection, environments=None):
        # Note: requires a unix server; Windows is not supported yet
self.connection = connection
self.environments = environments
self._usage = (0, 0, 0)
    def alloc(self, cpu, ram, gpu):
        # all-or-nothing allocation: (0, 0, 0) means the machine is idle,
        # (-1, -1, -1) means occupied; the cpu/ram/gpu arguments are not used yet
        if self._usage == (0, 0, 0):
            self._usage = (-1, -1, -1)
            return True
        else:
            return False
    def test_connection(self):
        client = None
        try:
            # check the connection is available
            client = ssh_utils.create_ssh_client(**self.connection)
            assert client
            logger.info(f"connect to host {self.hostname} successfully")
        except Exception:
            logger.exception("test connection failed")
finally:
if client is not None:
client.close()
    @staticmethod
    def total_resources():
        # note: reports the local cpu count, not the remote machine's resources
        return cpu_count(), 0, 0
    def release(self, released_usage):
self._usage = (0, 0, 0)
@property
def usage(self):
return self._usage
@property
def hostname(self):
return self.connection['hostname']
class RemoteShellExecutor(ShellExecutor):
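    """Executor that uploads assets and the run script over SFTP, then runs the
    script on a `SSHRemoteMachine` via a non-blocking `exec_command`.
    """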
def __init__(self, job: _ShellJob, api_server_portal, machine: SSHRemoteMachine):
super(RemoteShellExecutor, self).__init__(job, api_server_portal, environments=machine.environments)
self.machine = machine
self._command_ssh_client = None
self._remote_process = None
@property
def connections(self):
return self.machine.connection
    def prepare_assets(self, sftp_client: SFTPClient):
        if len(self.job.assets) == 0:
            return
        for asset in self.job.assets:
            asset_path = Path(asset).absolute()
            asset_file = asset_path.as_posix()
            if not asset_path.exists():
                logger.warning(f"local path {asset_path} does not exist, skip uploading")
                continue
            if asset_path.is_dir():
                ssh_utils.upload_dir(sftp_client, asset_file, self.job.resources_path.as_posix())
            else:
                ssh_utils.upload_file(sftp_client, asset_file, (self.job.resources_path / asset_path.name).as_posix())
    def run(self, *, independent_tmp=True):
        data_dir = self.job.data_dir_path.as_posix()
        logger.debug(f"create remote data dir {data_dir}")
        ssh_conn = None
        sftp_client = None
        try:
            ssh_conn = ssh_utils.create_ssh_client(**self.connections)
            sftp_client = ssh_conn.open_sftp()
            logger.debug(f"create remote job output dir {data_dir}")
            ssh_utils.makedirs(sftp_client, data_dir)
            logger.debug(f"prepare to upload assets to {self.machine.hostname}")
            self.prepare_assets(sftp_client)  # share the connection
            # upload the generated run script to the remote host
            run_file_content = self._make_run_shell_content(independent_tmp).encode('utf-8')
            ssh_utils.upload_file_obj(sftp_client, io.BytesIO(run_file_content), self.job.run_file)
            logger.debug(f'upload run file to {self.job.run_file}')
            sftp_client.close()
            # execute the command asynchronously, keeping the ssh connection open
            self._command_ssh_client = ssh_conn
            command = f'sh {self.job.run_file}'
            logger.debug(f'execute command {command}')
            self._remote_process = self._command_ssh_client.exec_command(command, get_pty=True)
        except Exception:
            if sftp_client is not None:
                sftp_client.close()
            if ssh_conn is not None:
                ssh_conn.close()
            raise
    def status(self):
        if self._remote_process is not None:
            # _remote_process is the (stdin, stdout, stderr) tuple returned by exec_command
            stdout = self._remote_process[1]
if stdout.channel.exit_status_ready():
ret_code = stdout.channel.recv_exit_status()
return _ShellJob.STATUS_SUCCEED if ret_code == 0 else _ShellJob.STATUS_FAILED
else:
return _ShellJob.STATUS_RUNNING
else:
return _ShellJob.STATUS_INIT
    def finished(self):
return self.status() in _ShellJob.FINAL_STATUS
    def kill(self):
self.close()
    def close(self):
        if self._command_ssh_client is not None:
            self._command_ssh_client.close()
        else:
            logger.warning("current executor is not running or already closed")
class ExecutorManager:
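    """Base class that tracks executors; concrete managers decide how resources
    are allocated, released and killed.
    """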
def __init__(self, api_server_portal):
self.api_server_portal = api_server_portal
self._waiting_queue = []
    def allocated_executors(self):
        raise NotImplementedError

    def waiting_executors(self):
        return self._waiting_queue

    def alloc_executor(self, job):
        raise NotImplementedError

    def kill_executor(self, executor):
        raise NotImplementedError

    def release_executor(self, executor):
        raise NotImplementedError
    def get_executor(self, job):
for e in self.allocated_executors():
if e.job == job:
return e
return None
    def prepare(self):
pass
# def to_config(self):
# raise NotImplemented
class RemoteSSHExecutorManager(ExecutorManager):
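    """Schedules executors over a pool of `SSHRemoteMachine`s, one job per idle machine."""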
def __init__(self, api_server_portal, machines: List[SSHRemoteMachine]):
super(RemoteSSHExecutorManager, self).__init__(api_server_portal)
self.machines = machines
self._executors_map = {}
    def allocated_executors(self):
return self._executors_map.keys()
    def alloc_executor(self, job):
for machine in self.machines:
if machine.usage == (0, 0, 0):
ret = machine.alloc(-1, -1, -1) # lock resource
if ret:
hostname = machine.hostname
logger.info(f'allocated resource for {job.name},'
f' data dir at: {hostname}{job.data_dir_path}')
executor = RemoteShellExecutor(job, self.api_server_portal, machine)
job.set_ext({
'backend_type': 'remote',
'hostname': hostname
})
                    # DO NOT push anything to `_executors_map` or `_waiting_queue` if an exception occurred
self._executors_map[executor] = machine
self._waiting_queue.append(executor) # put it to queue if ready
return executor
raise NoResourceException
    def kill_executor(self, executor):
self.release_executor(executor)
    def release_executor(self, executor):
executor.close() # close connection
machine = self._executors_map.get(executor)
logger.debug(f"release resource of machine {machine.hostname} ")
machine.release(executor.job.resource)
self._waiting_queue.remove(executor)
    def prepare(self):
for machine in self.machines:
machine.test_connection()
class LocalExecutorManager(ExecutorManager):
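    """Runs at most one `LocalShellExecutor` at a time on the local machine."""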
def __init__(self, api_server_portal, environments=None):
super(LocalExecutorManager, self).__init__(api_server_portal)
self.environments = environments
self._allocated_executors = []
self._is_busy = False
    def allocated_executors(self):
return self._allocated_executors
    def alloc_executor(self, job):
if not self._is_busy:
executor = LocalShellExecutor(job, self.api_server_portal, environments=self.environments)
self._allocated_executors.append(executor)
self._waiting_queue.append(executor)
self._is_busy = True
            logger.info(f'allocated resource for job {job.name}, data dir at: localhost{job.data_dir_path}')
return executor
raise NoResourceException
    def release_executor(self, executor):
self._is_busy = False
self._waiting_queue.remove(executor)
executor.close()
    def kill_executor(self, executor: LocalShellExecutor):
self.release_executor(executor)
executor.kill()
def create_executor_manager(backend_conf, server_host, server_port):  # instance factory
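    """Create a `LocalExecutorManager` or `RemoteSSHExecutorManager` from
    `backend_conf`; the backend `type` defaults to 'local'.
    """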
backend_conf = backend_conf if backend_conf is not None else {}
server_portal = http_portal(server_host, server_port)
backend_type = backend_conf.get('type', 'local')
if backend_type == 'remote':
machines = [SSHRemoteMachine(**_) for _ in backend_conf['machines']]
# check remote host setting, only warning for remote backend
if consts.HOST_LOCALHOST == server_host:
logger.warning("recommended that set IP address that can be accessed in remote machines, "
"but now it's \"localhost\", and the task executed on the remote machines "
"may fail because it can't get information from the api server server,"
" you can set it in `server.host` ")
return RemoteSSHExecutorManager(server_portal, machines)
elif backend_type == 'local':
return LocalExecutorManager(server_portal, environments=backend_conf.get('environments'))
else:
raise ValueError(f"unknown backend {backend_type}")
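# Usage sketch (illustrative only; the hostname, credentials and connection keys
# below are hypothetical and depend on what `ssh_utils.create_ssh_client` accepts):
#
#   manager = create_executor_manager(
#       {'type': 'remote',
#        'machines': [{'connection': {'hostname': '192.168.0.11',
#                                     'username': 'hyperctl',
#                                     'password': '***'}}]},
#       server_host='192.168.0.1', server_port=8060)
#   manager.prepare()  # test the connection to every machine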