Note
Go to the end to download the full example code
Relationship between error and latent distance
Distances in latent space do not provide an error estimate in the units of the property being predicted. So we calibrate the error estimate by fitting the predictive variance to a simple conditional Gaussian distribution of the error, similar to the approach in "A quantitative uncertainty metric controls error in neural network-driven chemical discovery".
Here we take carbon as an example:
from cProfile import label
from pynep.io import load_nep
from pynep.calculate import NEP
import numpy as np
from scipy.spatial.distance import cdist
from scipy.optimize import minimize
import matplotlib.pyplot as plt
# Build the NEP calculator from the carbon potential file and read the
# training / test structure sets.
calc = NEP('C_2022_NEP3.txt')
train_data = load_nep('train.in')
test_data = load_nep('test.in')

# Concatenate the 'latent' property of every training structure into one
# array; it is the reference set for the distance queries further down.
# NOTE(review): presumably one latent row per atom -- confirm against PyNEP.
train_lat = np.concatenate(
    [calc.get_property('latent', structure) for structure in train_data]
)
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
2 0
48 0
100 0
36 0
64 0
125 0
64 0
125 0
64 0
64 0
27 0
36 0
36 0
48 0
64 0
27 0
27 0
64 0
125 0
64 0
48 0
64 0
125 0
36 0
64 0
48 0
64 0
64 0
64 0
125 0
100 0
48 0
125 0
64 0
64 0
80 0
48 0
48 0
64 0
64 0
27 0
125 0
64 0
64 0
64 0
100 0
27 0
64 0
48 0
125 0
64 0
64 0
125 0
64 0
64 0
64 0
125 0
64 0
64 0
125 0
64 0
27 0
64 0
64 0
27 0
36 0
64 0
36 0
64 0
48 0
48 0
64 0
64 0
64 0
48 0
100 0
125 0
64 0
64 0
36 0
27 0
64 0
125 0
125 0
48 0
64 0
27 0
125 0
48 0
36 0
80 0
64 0
48 0
64 0
64 0
64 0
48 0
36 0
48 0
80 0
64 0
64 0
64 0
64 0
64 0
48 0
64 0
64 0
125 0
64 0
64 0
48 0
125 0
48 0
64 0
64 0
64 0
64 0
64 0
64 0
36 0
48 0
64 0
64 0
64 0
125 0
125 0
125 0
64 0
27 0
125 0
64 0
36 0
36 0
100 0
36 0
64 0
64 0
27 0
64 0
36 0
64 0
48 0
36 0
64 0
64 0
64 0
27 0
64 0
64 0
64 0
64 0
80 0
48 0
36 0
80 0
125 0
48 0
36 0
36 0
64 0
125 0
64 0
100 0
27 0
64 0
64 0
64 0
64 0
64 0
36 0
27 0
64 0
27 0
48 0
64 0
64 0
27 0
125 0
64 0
125 0
125 0
64 0
125 0
27 0
64 0
64 0
125 0
64 0
64 0
64 0
64 0
48 0
36 0
64 0
64 0
125 0
64 0
27 0
64 0
125 0
64 0
64 0
27 0
48 0
64 0
64 0
27 0
64 0
125 0
48 0
125 0
64 0
64 0
36 0
64 0
27 0
64 0
64 0
27 0
48 0
48 0
27 0
48 0
27 0
27 0
64 0
64 0
48 0
64 0
36 0
27 0
125 0
27 0
64 0
36 0
27 0
125 0
100 0
64 0
64 0
64 0
64 0
64 0
64 0
36 0
27 0
125 0
48 0
36 0
125 0
64 0
64 0
27 0
64 0
64 0
64 0
27 0
64 0
64 0
64 0
125 0
27 0
64 0
36 0
64 0
64 0
125 0
48 0
64 0
64 0
64 0
64 0
36 0
64 0
64 0
64 0
125 0
64 0
125 0
64 0
64 0
48 0
27 0
48 0
125 0
64 0
64 0
125 0
64 0
64 0
64 0
125 0
48 0
125 0
27 0
80 0
64 0
64 0
125 0
64 0
36 0
64 0
64 0
64 0
125 0
64 0
64 0
64 0
36 0
36 0
48 0
64 0
125 0
64 0
64 0
48 0
125 0
64 0
64 0
125 0
100 0
125 0
48 0
36 0
48 0
64 0
64 0
125 0
64 0
125 0
80 0
125 0
64 0
64 0
64 0
48 0
64 0
125 0
125 0
64 0
48 0
64 0
125 0
48 0
64 0
64 0
64 0
48 0
64 0
125 0
64 0
64 0
27 0
64 0
64 0
100 0
64 0
48 0
64 0
36 0
64 0
64 0
48 0
125 0
80 0
64 0
64 0
64 0
100 0
64 0
64 0
64 0
36 0
64 0
125 0
125 0
27 0
64 0
48 0
64 0
64 0
64 0
48 0
64 0
36 0
64 0
36 0
64 0
125 0
64 0
36 0
64 0
125 0
64 0
64 0
36 0
125 0
80 0
64 0
48 0
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
48 1
36 1
27 1
48 1
48 1
36 1
64 1
64 1
64 1
64 1
27 1
64 1
64 1
64 1
27 1
27 1
27 1
27 1
36 1
36 1
27 1
48 1
64 1
64 1
36 1
64 1
64 1
48 1
27 1
48 1
64 1
48 1
36 1
36 1
64 1
64 1
64 1
27 1
36 1
27 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
100 1
125 1
125 1
80 1
64 1
80 1
100 1
125 1
125 1
100 1
80 1
125 1
80 1
80 1
64 1
125 1
64 1
125 1
125 1
64 1
64 1
100 1
100 1
80 1
100 1
100 1
64 1
64 1
64 1
27 1
27 1
48 1
48 1
64 1
36 1
36 1
64 1
36 1
48 1
48 1
48 1
64 1
48 1
48 1
64 1
27 1
48 1
64 1
48 1
48 1
48 1
64 1
27 1
36 1
36 1
64 1
36 1
27 1
27 1
36 1
64 1
36 1
48 1
48 1
64 1
48 1
48 1
64 1
36 1
48 1
36 1
36 1
27 1
48 1
27 1
48 1
64 1
27 1
64 1
27 1
36 1
64 1
64 1
64 1
36 1
27 1
48 1
36 1
48 1
27 1
36 1
36 1
48 1
48 1
64 1
36 1
48 1
27 1
64 1
27 1
48 1
27 1
48 1
64 1
36 1
48 1
48 1
64 1
27 1
64 1
48 1
64 1
27 1
27 1
27 1
27 1
48 1
48 1
48 1
64 1
48 1
36 1
48 1
27 1
27 1
48 1
27 1
48 1
64 1
48 1
27 1
36 1
48 1
27 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
125 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
8 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
4 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
216 1
216 1
216 1
216 1
216 1
216 1
216 1
216 1
216 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
64 1
We use 10% of the test set to calculate, for each atom, the force error and the minimum latent-space distance to the training data.
def split_dataset(dataset, ratio=0.1):
    """Randomly split ``dataset`` into two disjoint parts.

    The item order is permuted with ``np.random.shuffle`` (seed NumPy's
    global RNG beforehand for a reproducible split). The first
    ``int(len(dataset) * ratio)`` items of the permutation form the first
    part and the rest form the second.

    Args:
        dataset: Sequence supporting ``len()`` and integer indexing.
        ratio: Fraction of the items placed in the first part.

    Returns:
        A ``(selected, remaining)`` tuple of lists that together contain
        every item of ``dataset`` exactly once.
    """
    indices = np.arange(len(dataset))
    np.random.shuffle(indices)
    num = int(len(dataset) * ratio)
    # Index through the shuffled permutation so the split is actually random
    # instead of always taking the leading items of ``dataset``.
    selected = [dataset[i] for i in indices[:num]]
    remaining = [dataset[i] for i in indices[num:]]
    return selected, remaining
# d1 is the part of the test set processed here; d2 is the remainder.
d1, d2 = split_dataset(test_data)

# Per-atom accumulators: d collects latent distances, e collects force errors.
d, e = [], []
for structure in d1:
    predicted = calc.get_forces(structure)
    reference = structure.info['forces']
    # Euclidean norm of the force difference along axis 1.
    force_err = np.sqrt(np.sum((predicted - reference) ** 2, axis=1))
    latent = calc.get_property('latent', structure)
    # Smallest distance from each latent row to any row of train_lat.
    nearest = cdist(latent, train_lat).min(axis=1)
    d.extend(nearest.tolist())
    e.extend(force_err.tolist())
d = np.array(d)  # min distances to trainset
e = np.array(e)  # errors between nep and dft
Here we suppose \(\sigma = a + b \cdot d + c \cdot d^2\), with a, b, c > 0. Maximizing the likelihood \(\mathcal{N}(\epsilon \vert 0, \sigma)\) is equivalent to minimizing \(2 \cdot \ln(\sigma) + \epsilon^2/\sigma^2\), summed over all atoms.
def loss(args):
    """Objective for fitting sigma(d) to the observed errors.

    ``args`` holds the three coefficients ``(a, b, c)`` of
    ``sigma = |a + b * d + c * d ** 2|``. The arrays ``d`` and ``e`` are
    read from module scope.

    Returns the sum over all entries of ``2 * log(sigma) + (e / sigma) ** 2``,
    which equals twice the negative log-likelihood of ``e`` under a
    zero-mean Gaussian with standard deviation ``sigma``, up to an additive
    constant.
    """
    a, b, c = args
    spread = np.abs(a + b * d + c * d ** 2)
    log_term = 2 * np.log(spread)
    residual_term = (e / spread) ** 2
    return np.sum(log_term + residual_term)
# Single inequality constraint: every coefficient minus 0.0001 must be
# non-negative, i.e. each of a, b, c stays at or above 0.0001.
cons = (
    dict(type='ineq', fun=lambda coeffs: coeffs - 0.0001),
)
# Initial guess a = b = c = 1.
x0 = np.array([1, 1, 1])
res = minimize(loss, x0, method='SLSQP', constraints=cons)
Results: we check the fitted \(\sigma(d)\) on a subsample of the remaining test data.
# Collect per-atom latent distances (d_) and force errors (e_) for every
# 10th structure of d2.
d_, e_ = [], []
for structure in d2[::10]:
    predicted = calc.get_forces(structure)
    reference = structure.info['forces']
    force_err = np.sqrt(np.sum((predicted - reference) ** 2, axis=1))
    latent = calc.get_property('latent', structure)
    nearest = cdist(latent, train_lat).min(axis=1)
    d_.extend(nearest.tolist())
    e_.extend(force_err.tolist())
d_ = np.array(d_)
e_ = np.array(e_)

# Fitted sigma at each sampled distance, and the fraction of errors that
# fall below one and two sigma.
a_fit, b_fit, c_fit = res.x
sigma_ = a_fit + b_fit * d_ + c_fit * d_ ** 2
n_points = len(e_)
s1 = np.count_nonzero(e_ < sigma_) / n_points
s2 = np.count_nonzero(e_ < 2 * sigma_) / n_points

# Scatter of error vs. distance with the 0..sigma band (blue) and the
# sigma..2*sigma band (yellow); each legend entry shows the fraction of
# points inside its band.
plt.scatter(d_, e_, s=4)
dd = np.linspace(0, d_.max() + 0.1, 100)
sigma = a_fit + b_fit * dd + c_fit * dd ** 2
plt.fill_between(
    dd, sigma,
    facecolor='blue', alpha=0.3, label="{:.2%}".format(s1),
)
plt.fill_between(
    dd, 2 * sigma, sigma,
    facecolor='yellow', alpha=0.3, label="{:.2%}".format(s2 - s1),
)
plt.legend()

<matplotlib.legend.Legend object at 0x7ff24b8c1710>
Total running time of the script: ( 0 minutes 17.767 seconds)