core.launchers.api#

Copyright (c) Meta Platforms, Inc. and affiliates.

This source code is licensed under the MIT license found in the LICENSE file in the root directory of this source tree.

Attributes#

Classes#

SchedulerType

Enum where members are also (and must be) strings

DeviceType

Enum where members are also (and must be) strings

RunType

Enum where members are also (and must be) strings

DistributedInitMethod

Enum where members are also (and must be) strings

SlurmConfig

RayClusterConfig

SchedulerConfig

SlurmEnv

Metadata

JobConfig

Module Contents#

core.launchers.api.ALLOWED_TOP_LEVEL_KEYS#
core.launchers.api.LOG_DIR_NAME = 'logs'#
core.launchers.api.CHECKPOINT_DIR_NAME = 'checkpoints'#
core.launchers.api.RESULTS_DIR = 'results'#
core.launchers.api.CONFIG_FILE_NAME = 'canonical_config.yaml'#
core.launchers.api.PREEMPTION_STATE_DIR_NAME = 'preemption_state'#
class core.launchers.api.SchedulerType#

Bases: fairchem.core.common.utils.StrEnum

Enum where members are also (and must be) strings

LOCAL = 'local'#
SLURM = 'slurm'#
class core.launchers.api.DeviceType#

Bases: fairchem.core.common.utils.StrEnum

Enum where members are also (and must be) strings

CPU = 'cpu'#
CUDA = 'cuda'#
class core.launchers.api.RunType#

Bases: fairchem.core.common.utils.StrEnum

Enum where members are also (and must be) strings

RUN = 'run'#
REDUCE = 'reduce'#
class core.launchers.api.DistributedInitMethod#

Bases: fairchem.core.common.utils.StrEnum

Enum where members are also (and must be) strings

TCP = 'tcp'#
FILE = 'file'#
class core.launchers.api.SlurmConfig#
mem_gb: int = 80#
timeout_hr: int = 168#
cpus_per_task: int = 8#
partition: str | None = None#
qos: str | None = None#
account: str | None = None#
additional_parameters: dict | None = None#
class core.launchers.api.RayClusterConfig#
head_gpus: int = 0#
class core.launchers.api.SchedulerConfig#
mode: SchedulerType#
distributed_init_method: DistributedInitMethod#
ranks_per_node: int = 1#
num_nodes: int = 1#
num_array_jobs: int = 1#
slurm: SlurmConfig#
use_ray: bool = False#
ray_cluster: RayClusterConfig#
class core.launchers.api.SlurmEnv#
job_id: str | None = None#
raw_job_id: str | None = None#
array_job_id: str | None = None#
array_task_id: str | None = None#
restart_count: str | None = None#
class core.launchers.api.Metadata#
commit: str#
log_dir: str#
checkpoint_dir: str#
results_dir: str#
config_path: str#
preemption_checkpoint_dir: str#
cluster_name: str#
array_job_num: int = 0#
slurm_env: SlurmEnv#
class core.launchers.api.JobConfig#
run_name: str#
timestamp_id: str#
run_dir: str#
device_type: DeviceType#
debug: bool = False#
scheduler: SchedulerConfig#
logger: dict | None = None#
seed: int = 0#
deterministic: bool = False#
runner_state_path: str | None = None#
metadata: Metadata | None = None#
graph_parallel_group_size: int | None = None#
__post_init__() → None#