AdsorbML tutorial#
from fairchem.core.common.relaxation.ase_utils import OCPCalculator
import ase.io
from ase.optimize import BFGS
from fairchem.data.oc.core import Adsorbate, AdsorbateSlabConfig, Bulk, Slab
import os
from glob import glob
import pandas as pd
from fairchem.data.oc.utils import DetectTrajAnomaly
from fairchem.data.oc.utils.vasp import write_vasp_input_files
# Optional - see below
import numpy as np
from dscribe.descriptors import SOAP
from scipy.spatial.distance import pdist, squareform
from x3dase.visualize import view_x3d_n
Enumerate the adsorbate-slab configurations to run relaxations on#
AdsorbML incorporates random placement, which is especially useful for more complicated adsorbates which may have many degrees of freedom. I have opted sample a few random placements and a few heuristic. Here I am using *CO on copper (1,1,1) as an example.
bulk_src_id = "mp-30"
adsorbate_smiles = "*CO"
bulk = Bulk(bulk_src_id_from_db = bulk_src_id)
adsorbate = Adsorbate(adsorbate_smiles_from_db=adsorbate_smiles)
slabs = Slab.from_bulk_get_specific_millers(bulk = bulk, specific_millers=(1,1,1))
# There may be multiple slabs with this miller index.
# For demonstrative purposes we will take the first entry.
slab = slabs[0]
Downloading /home/runner/work/fairchem/fairchem/src/fairchem/data/oc/databases/pkls/bulks.pkl...
# Perform heuristic placements
heuristic_adslabs = AdsorbateSlabConfig(slabs[0], adsorbate, mode="heuristic")
# Perform random placements
# (for AdsorbML we use `num_sites = 100` but we will use 4 for brevity here)
random_adslabs = AdsorbateSlabConfig(slabs[0], adsorbate, mode="random_site_heuristic_placement", num_sites = 4)
Run ML relaxations:#
There are 2 options for how to do this.
Using
OCPCalculator
as the calculator within the ASE frameworkBy writing objects to lmdb and relaxing them using
main.py
in the ocp repo
(1) is really only adequate for small stuff and it is what I will show here, but if you plan to run many relaxations, you should definitely use (2). More details about writing lmdbs has been provided here - follow the IS2RS/IS2RE instructions. And more information about running relaxations once the lmdb has been written is here.
You need to provide the calculator with a path to a model checkpoint file. That can be downloaded here
from fairchem.core.common.relaxation.ase_utils import OCPCalculator
from fairchem.core.models.model_registry import model_name_to_local_file
import os
checkpoint_path = model_name_to_local_file('EquiformerV2-31M-S2EF-OC20-All+MD', local_cache='/tmp/fairchem_checkpoints/')
os.makedirs(f"data/{bulk}_{adsorbate}", exist_ok=True)
# Define the calculator
calc = OCPCalculator(checkpoint_path=checkpoint_path) # if you have a gpu, add `cpu=False` to speed up calculations
adslabs = [*heuristic_adslabs.atoms_list, *random_adslabs.atoms_list]
# Set up the calculator
for idx, adslab in enumerate(adslabs):
adslab.calc = calc
opt = BFGS(adslab, trajectory=f"data/{bulk}_{adsorbate}/{idx}.traj")
opt.run(fmax=0.05, steps=100) # For the AdsorbML results we used fmax = 0.02 and steps = 300, but we will use less strict values for brevity.
/home/runner/work/fairchem/fairchem/src/fairchem/core/models/scn/spherical_harmonics.py:23: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
_Jd = torch.load(os.path.join(os.path.dirname(__file__), "Jd.pt"))
/home/runner/work/fairchem/fairchem/src/fairchem/core/models/equiformer_v2/wigner.py:10: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
_Jd = torch.load(os.path.join(os.path.dirname(__file__), "Jd.pt"))
/home/runner/work/fairchem/fairchem/src/fairchem/core/models/escn/so3.py:23: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
_Jd = torch.load(os.path.join(os.path.dirname(__file__), "Jd.pt"))
/home/runner/work/fairchem/fairchem/src/fairchem/core/common/relaxation/ase_utils.py:200: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu"))
WARNING:root:Detected old config, converting to new format. Consider updating to avoid potential incompatibilities.
INFO:root:amp: true
cmd:
checkpoint_dir: /home/runner/work/fairchem/fairchem/docs/tutorials/checkpoints/2025-03-27-03-48-16
commit: core:6a780ac,experimental:NA
identifier: ''
logs_dir: /home/runner/work/fairchem/fairchem/docs/tutorials/logs/wandb/2025-03-27-03-48-16
print_every: 100
results_dir: /home/runner/work/fairchem/fairchem/docs/tutorials/results/2025-03-27-03-48-16
seed: null
timestamp_id: 2025-03-27-03-48-16
version: 0.1.dev1+g6a780ac
dataset:
format: trajectory_lmdb_v2
grad_target_mean: 0.0
grad_target_std: 2.887317180633545
key_mapping:
force: forces
y: energy
normalize_labels: true
target_mean: -0.7554450631141663
target_std: 2.887317180633545
transforms:
normalizer:
energy:
mean: -0.7554450631141663
stdev: 2.887317180633545
forces:
mean: 0.0
stdev: 2.887317180633545
evaluation_metrics:
metrics:
energy:
- mae
forces:
- forcesx_mae
- forcesy_mae
- forcesz_mae
- mae
- cosine_similarity
- magnitude_error
misc:
- energy_forces_within_threshold
primary_metric: forces_mae
gp_gpus: null
gpus: 0
logger: wandb
loss_functions:
- energy:
coefficient: 4
fn: mae
- forces:
coefficient: 100
fn: l2mae
model:
alpha_drop: 0.1
attn_activation: silu
attn_alpha_channels: 64
attn_hidden_channels: 64
attn_value_channels: 16
distance_function: gaussian
drop_path_rate: 0.1
edge_channels: 128
ffn_activation: silu
ffn_hidden_channels: 128
grid_resolution: 18
lmax_list:
- 4
max_neighbors: 20
max_num_elements: 90
max_radius: 12.0
mmax_list:
- 2
name: equiformer_v2
norm_type: layer_norm_sh
num_distance_basis: 512
num_heads: 8
num_layers: 8
num_sphere_samples: 128
otf_graph: true
proj_drop: 0.0
regress_forces: true
sphere_channels: 128
use_atom_edge_embedding: true
use_gate_act: false
use_grid_mlp: true
use_pbc: true
use_s2_act_attn: false
weight_init: uniform
optim:
batch_size: 8
clip_grad_norm: 100
ema_decay: 0.999
energy_coefficient: 4
eval_batch_size: 8
eval_every: 10000
force_coefficient: 100
grad_accumulation_steps: 1
load_balancing: atoms
loss_energy: mae
loss_force: l2mae
lr_initial: 0.0004
max_epochs: 3
num_workers: 8
optimizer: AdamW
optimizer_params:
weight_decay: 0.001
scheduler: LambdaLR
scheduler_params:
epochs: 1009275
lambda_type: cosine
lr: 0.0004
lr_min_factor: 0.01
warmup_epochs: 3364.25
warmup_factor: 0.2
outputs:
energy:
level: system
forces:
eval_on_free_atoms: true
level: atom
train_on_free_atoms: true
relax_dataset: {}
slurm:
additional_parameters:
constraint: volta32gb
cpus_per_task: 9
folder: /checkpoint/abhshkdz/open-catalyst-project/logs/equiformer_v2/8307793
gpus_per_node: 8
job_id: '8307793'
job_name: eq2s_051701_allmd
mem: 480GB
nodes: 8
ntasks_per_node: 8
partition: learnaccel
time: 4320
task:
dataset: trajectory_lmdb_v2
eval_on_free_atoms: true
grad_input: atomic forces
labels:
- potential energy
primary_metric: forces_mae
train_on_free_atoms: true
test_dataset: {}
trainer: ocp
val_dataset: {}
INFO:root:Loading model: equiformer_v2
WARNING:root:equiformer_v2 (EquiformerV2) class is deprecated in favor of equiformer_v2_backbone_and_heads (EquiformerV2BackboneAndHeads)
INFO:root:Loaded EquiformerV2 with 31058690 parameters.
INFO:root:Loading checkpoint in inference-only mode, not loading keys associated with trainer state!
/home/runner/work/fairchem/fairchem/src/fairchem/core/modules/normalization/normalizer.py:69: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
"mean": torch.tensor(state_dict["mean"]),
WARNING:root:No seed has been set in modelcheckpoint or OCPCalculator! Results may not be reproducible on re-run
Step Time Energy fmax
BFGS: 0 03:47:40 -0.219420 1.055759
BFGS: 1 03:47:43 -0.250954 1.506871
BFGS: 2 03:47:45 -0.342938 1.770245
BFGS: 3 03:47:48 -0.429553 1.501286
BFGS: 4 03:47:51 -0.464051 0.732286
BFGS: 5 03:47:53 -0.483605 0.342824
BFGS: 6 03:47:56 -0.491827 0.242257
BFGS: 7 03:47:58 -0.494074 0.229758
BFGS: 8 03:48:01 -0.511956 0.413290
BFGS: 9 03:48:04 -0.514707 0.199528
BFGS: 10 03:48:06 -0.517306 0.109229
BFGS: 11 03:48:09 -0.518794 0.191672
BFGS: 12 03:48:11 -0.521435 0.280747
BFGS: 13 03:48:14 -0.524155 0.319149
BFGS: 14 03:48:16 -0.526752 0.272179
BFGS: 15 03:48:19 -0.528180 0.216372
BFGS: 16 03:48:22 -0.528516 0.068561
BFGS: 17 03:48:24 -0.529079 0.101672
BFGS: 18 03:48:27 -0.530274 0.128156
BFGS: 19 03:48:29 -0.532347 0.092542
BFGS: 20 03:48:32 -0.533981 0.023639
Step Time Energy fmax
BFGS: 0 03:48:34 -0.355009 1.338747
BFGS: 1 03:48:37 -0.363088 1.668194
BFGS: 2 03:48:39 -0.394823 1.069774
BFGS: 3 03:48:42 -0.502499 0.805577
BFGS: 4 03:48:45 -0.505364 0.240416
BFGS: 5 03:48:47 -0.506131 0.247016
BFGS: 6 03:48:50 -0.507926 0.434834
BFGS: 7 03:48:52 -0.510988 0.441642
BFGS: 8 03:48:55 -0.515407 0.250360
BFGS: 9 03:48:57 -0.517662 0.084743
BFGS: 10 03:49:00 -0.518407 0.183304
BFGS: 11 03:49:03 -0.520567 0.319576
BFGS: 12 03:49:05 -0.522336 0.321716
BFGS: 13 03:49:08 -0.523712 0.176268
BFGS: 14 03:49:10 -0.524876 0.057833
BFGS: 15 03:49:13 -0.525446 0.055125
BFGS: 16 03:49:16 -0.525622 0.073803
BFGS: 17 03:49:18 -0.525549 0.071650
BFGS: 18 03:49:21 -0.525355 0.041408
Step Time Energy fmax
BFGS: 0 03:49:23 -0.432384 1.859380
BFGS: 1 03:49:26 -0.431837 2.276338
BFGS: 2 03:49:28 -0.463512 0.731791
BFGS: 3 03:49:31 -0.482841 0.692695
BFGS: 4 03:49:34 -0.522804 1.213269
BFGS: 5 03:49:36 -0.549422 0.603202
BFGS: 6 03:49:39 -0.552738 0.192523
BFGS: 7 03:49:41 -0.553695 0.154432
BFGS: 8 03:49:44 -0.553749 0.252471
BFGS: 9 03:49:46 -0.556267 0.273960
BFGS: 10 03:49:49 -0.560406 0.195957
BFGS: 11 03:49:52 -0.562428 0.081558
BFGS: 12 03:49:54 -0.562473 0.095610
BFGS: 13 03:49:57 -0.563211 0.208800
BFGS: 14 03:49:59 -0.564372 0.222973
BFGS: 15 03:50:02 -0.565657 0.135709
BFGS: 16 03:50:04 -0.566187 0.085208
BFGS: 17 03:50:07 -0.566944 0.028212
Step Time Energy fmax
BFGS: 0 03:50:09 -0.433580 1.879928
BFGS: 1 03:50:12 -0.433088 2.297147
BFGS: 2 03:50:14 -0.464390 0.737337
BFGS: 3 03:50:17 -0.484699 0.690741
BFGS: 4 03:50:20 -0.525290 1.176131
BFGS: 5 03:50:22 -0.550794 0.546326
BFGS: 6 03:50:25 -0.554122 0.182442
BFGS: 7 03:50:27 -0.554337 0.182682
BFGS: 8 03:50:30 -0.554536 0.281950
BFGS: 9 03:50:32 -0.557285 0.283169
BFGS: 10 03:50:35 -0.560997 0.181483
BFGS: 11 03:50:37 -0.561933 0.072685
BFGS: 12 03:50:40 -0.562460 0.112652
BFGS: 13 03:50:43 -0.562951 0.206033
BFGS: 14 03:50:45 -0.563608 0.214400
BFGS: 15 03:50:48 -0.564846 0.136038
BFGS: 16 03:50:50 -0.565723 0.091919
BFGS: 17 03:50:53 -0.565872 0.033393
Step Time Energy fmax
BFGS: 0 03:50:55 -0.181454 0.828213
BFGS: 1 03:50:58 -0.197180 0.776433
BFGS: 2 03:51:01 -0.316756 2.857461
BFGS: 3 03:51:03 -0.362075 1.547609
BFGS: 4 03:51:06 -0.393180 0.604506
BFGS: 5 03:51:08 -0.400494 0.769766
BFGS: 6 03:51:11 -0.413951 0.590841
BFGS: 7 03:51:13 -0.400006 2.641831
BFGS: 8 03:51:16 -0.445786 0.373455
BFGS: 9 03:51:18 -0.452995 0.210465
BFGS: 10 03:51:21 -0.457523 0.538516
BFGS: 11 03:51:24 -0.460857 0.344750
BFGS: 12 03:51:26 -0.472161 0.239835
BFGS: 13 03:51:29 -0.475793 0.266026
BFGS: 14 03:51:31 -0.478308 0.240736
BFGS: 15 03:51:34 -0.480501 0.292617
BFGS: 16 03:51:36 -0.483398 0.096715
BFGS: 17 03:51:39 -0.485436 0.069094
BFGS: 18 03:51:42 -0.485567 0.123548
BFGS: 19 03:51:44 -0.486611 0.113103
BFGS: 20 03:51:47 -0.487522 0.077732
BFGS: 21 03:51:49 -0.487816 0.040794
Step Time Energy fmax
BFGS: 0 03:51:52 -0.254511 1.036552
BFGS: 1 03:51:54 -0.267340 1.167732
BFGS: 2 03:51:57 -0.330847 1.625289
BFGS: 3 03:51:59 -0.312435 3.613188
BFGS: 4 03:52:02 -0.434104 0.512086
BFGS: 5 03:52:05 -0.451575 0.350621
BFGS: 6 03:52:07 -0.463378 0.430905
BFGS: 7 03:52:10 -0.469389 0.429674
BFGS: 8 03:52:12 -0.491686 0.210755
BFGS: 9 03:52:15 -0.495277 0.239446
BFGS: 10 03:52:17 -0.497211 0.226119
BFGS: 11 03:52:20 -0.499521 0.181920
BFGS: 12 03:52:22 -0.504582 0.317697
BFGS: 13 03:52:25 -0.508579 0.343402
BFGS: 14 03:52:28 -0.513631 0.199759
BFGS: 15 03:52:30 -0.516322 0.153865
BFGS: 16 03:52:33 -0.517030 0.147726
BFGS: 17 03:52:35 -0.515977 0.207737
BFGS: 18 03:52:38 -0.517278 0.191375
BFGS: 19 03:52:40 -0.518914 0.099056
BFGS: 20 03:52:43 -0.519538 0.050027
BFGS: 21 03:52:46 -0.520191 0.060515
BFGS: 22 03:52:48 -0.520410 0.077755
BFGS: 23 03:52:51 -0.520798 0.069332
BFGS: 24 03:52:53 -0.520971 0.054683
BFGS: 25 03:52:56 -0.521239 0.040408
Step Time Energy fmax
BFGS: 0 03:52:58 -0.206576 0.974327
BFGS: 1 03:53:01 -0.236526 1.087665
BFGS: 2 03:53:04 -0.454323 1.600283
BFGS: 3 03:53:06 -0.445683 1.071650
BFGS: 4 03:53:09 -0.468873 0.320767
BFGS: 5 03:53:11 -0.472163 0.301568
BFGS: 6 03:53:14 -0.492092 0.850780
BFGS: 7 03:53:17 -0.498142 0.396648
BFGS: 8 03:53:19 -0.501239 0.089033
BFGS: 9 03:53:22 -0.501253 0.105863
BFGS: 10 03:53:24 -0.502030 0.178658
BFGS: 11 03:53:27 -0.503210 0.220811
BFGS: 12 03:53:29 -0.504625 0.193554
BFGS: 13 03:53:32 -0.507243 0.142544
BFGS: 14 03:53:35 -0.509398 0.073994
BFGS: 15 03:53:37 -0.510039 0.099356
BFGS: 16 03:53:40 -0.511371 0.163741
BFGS: 17 03:53:42 -0.513969 0.168627
BFGS: 18 03:53:45 -0.515905 0.083200
BFGS: 19 03:53:47 -0.516938 0.037139
Step Time Energy fmax
BFGS: 0 03:53:50 -0.184422 0.948035
BFGS: 1 03:53:52 -0.215584 1.312871
BFGS: 2 03:53:55 -0.350099 1.919557
BFGS: 3 03:53:58 -0.309475 3.069838
BFGS: 4 03:54:00 -0.437601 1.001787
BFGS: 5 03:54:03 -0.462046 0.528549
BFGS: 6 03:54:05 -0.477354 0.539126
BFGS: 7 03:54:08 -0.480632 0.377310
BFGS: 8 03:54:11 -0.493089 0.252259
BFGS: 9 03:54:13 -0.503960 0.195030
BFGS: 10 03:54:16 -0.506964 0.269012
BFGS: 11 03:54:18 -0.508931 0.184749
BFGS: 12 03:54:21 -0.510605 0.079177
BFGS: 13 03:54:23 -0.512128 0.173205
BFGS: 14 03:54:26 -0.514551 0.273913
BFGS: 15 03:54:28 -0.517884 0.296763
BFGS: 16 03:54:31 -0.520554 0.261282
BFGS: 17 03:54:34 -0.521403 0.160185
BFGS: 18 03:54:36 -0.521561 0.042192
Parse the trajectories and post-process#
As a post-processing step we check to see if:
the adsorbate desorbed
the adsorbate disassociated
the adsorbate intercalated
the surface has changed
We check these because they effect our referencing scheme and may result in erroneous energies. For (4), the relaxed surface should really be supplied as well. It will be necessary when correcting the SP / RX energies later. Since we don’t have it here, we will ommit supplying it, and the detector will instead compare the initial and final slab from the adsorbate-slab relaxation trajectory. If a relaxed slab is provided, the detector will compare it and the slab after the adsorbate-slab relaxation. The latter is more correct! Note: for the results in the AdsorbML paper, we did not check if the adsorbate was intercalated (is_adsorbate_intercalated()
) because it is a new addition.
# Iterate over trajs to extract results
results = []
for file in glob(f"data/{bulk}_{adsorbate}/*.traj"):
rx_id = file.split("/")[-1].split(".")[0]
traj = ase.io.read(file, ":")
# Check to see if the trajectory is anomolous
initial_atoms = traj[0]
final_atoms = traj[-1]
atom_tags = initial_atoms.get_tags()
detector = DetectTrajAnomaly(initial_atoms, final_atoms, atom_tags)
anom = (
detector.is_adsorbate_dissociated()
or detector.is_adsorbate_desorbed()
or detector.has_surface_changed()
or detector.is_adsorbate_intercalated()
)
rx_energy = traj[-1].get_potential_energy()
results.append({"relaxation_idx": rx_id, "relaxed_atoms": traj[-1],
"relaxed_energy_ml": rx_energy, "anomolous": anom})
df = pd.DataFrame(results)
df
relaxation_idx | relaxed_atoms | relaxed_energy_ml | anomolous | |
---|---|---|---|---|
0 | 1 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.525355 | False |
1 | 3 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.565872 | False |
2 | 4 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.487816 | False |
3 | 7 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.521561 | False |
4 | 5 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.521239 | False |
5 | 0 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.533981 | False |
6 | 6 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.516938 | False |
7 | 2 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.566944 | False |
#scrap anomalies
df = df[~df.anomolous].copy().reset_index()
(Optional) Deduplicate structures#
We may have enumerated very similar structures or structures may have relaxed to the same configuration. For this reason, it is advantageous to cull systems if they are very similar. This results in marginal improvements in the recall metrics we calculated for AdsorbML, so it wasnt implemented there. It is, however, a good way to prevent wasteful VASP calculations. You can also imagine that if we would have enumerated 1000 configs per slab adsorbate combo rather than 100 for AdsorbML, it is more likely that having redundant systems would reduce performance, so its a good thing to keep in mind. This may be done by eye for a small number of systems, but with many systems it is easier to use an automated approach. Here is an example of one such approach, which uses a SOAP descriptor to find similar systems.
# Extract the configs and their energies
def deduplicate(configs_for_deduplication: list,
adsorbate_binding_index: int,
cosine_similarity = 1e-3,
):
"""
A function that may be used to deduplicate similar structures.
Among duplicate entries, the one with the lowest energy will be kept.
Args:
configs_for_deduplication: a list of ML relaxed adsorbate-
surface configurations.
cosine_similarity: The cosine simularity value above which,
configurations are considered duplicate.
Returns:
(list): the indices of configs which should be kept as non-duplicate
"""
energies_for_deduplication = np.array([atoms.get_potential_energy() for atoms in configs_for_deduplication])
# Instantiate the soap descriptor
soap = SOAP(
species=np.unique(configs_for_deduplication[0].get_chemical_symbols()),
r_cut = 2.0,
n_max=6,
l_max=3,
periodic=True,
)
#Figure out which index cooresponds to
ads_len = list(configs_for_deduplication[0].get_tags()).count(2)
position_idx = -1*(ads_len-adsorbate_binding_index)
# Iterate over the systems to get the SOAP vectors
soap_desc = []
for config in configs_for_deduplication:
soap_ex = soap.create(config, centers=[position_idx])
soap_desc.extend(soap_ex)
soap_descs = np.vstack(soap_desc)
#Use euclidean distance to assess similarity
distance = squareform(pdist(soap_descs, metric="cosine"))
bool_matrix = np.where(distance <= cosine_similarity, 1, 0)
# For configs that are found to be similar, just keep the lowest energy one
idxs_to_keep = []
pass_idxs = []
for idx, row in enumerate(bool_matrix):
if idx in pass_idxs:
continue
elif sum(row) == 1:
idxs_to_keep.append(idx)
else:
same_idxs = [row_idx for row_idx, val in enumerate(row) if val == 1]
pass_idxs.extend(same_idxs)
# Pick the one with the lowest energy by ML
min_e = min(energies_for_deduplication[same_idxs])
idxs_to_keep.append(list(energies_for_deduplication).index(min_e))
return idxs_to_keep
configs_for_deduplication = df.relaxed_atoms.tolist()
idxs_to_keep = deduplicate(configs_for_deduplication, adsorbate.binding_indices[0])
# Flip through your configurations to check them out (and make sure deduplication looks good)
print(idxs_to_keep)
view_x3d_n(configs_for_deduplication[2].repeat((2,2,1)))
df = df.iloc[idxs_to_keep]
low_e_values = np.round(df.sort_values(by = "relaxed_energy_ml").relaxed_energy_ml.tolist()[0:5],3)
print(f"The lowest 5 energies are: {low_e_values}")
df
The lowest 5 energies are: [-0.567 -0.534 -0.525 -0.488]
index | relaxation_idx | relaxed_atoms | relaxed_energy_ml | anomolous | |
---|---|---|---|---|---|
0 | 0 | 1 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.525355 | False |
7 | 7 | 2 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.566944 | False |
2 | 2 | 4 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.487816 | False |
5 | 5 | 0 | (Atom('Cu', [-1.3000465215529715, 2.2517466275... | -0.533981 | False |
Write VASP input files#
This assumes you have access to VASP pseudopotentials and the right environment variables configured for ASE. The default VASP flags (which are equivalent to those used to make OC20) are located in ocdata.utils.vasp
. Alternatively, you may pass your own vasp flags to the write_vasp_input_files
function as vasp_flags
.
# Grab the 5 systems with the lowest energy
configs_for_dft = df.sort_values(by = "relaxed_energy_ml").relaxed_atoms.tolist()[0:5]
config_idxs = df.sort_values(by = "relaxed_energy_ml").relaxation_idx.tolist()[0:5]
# Write the inputs
for idx, config in enumerate(configs_for_dft):
os.mkdir(f"data/{config_idxs[idx]}")
write_vasp_input_files(config, outdir = f"data/{config_idxs[idx]}/")