{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Relationship between error and latent distance\n\nDistances in latent space does not provide an error estimate in the units of the \nproperty being predicted. So we calibrate the error estimate by ftting the predictive variance to \na simple conditional Gaussian distribution of the error similar to \n[A quantitative uncertainty metric controls error in neural network-driven chemical discovery](https://doi.org/10.1039/C9SC02298H)\n\nHere we took carbon for example:\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "from cProfile import label\nfrom pynep.io import load_nep\nfrom pynep.calculate import NEP\nimport numpy as np\nfrom scipy.spatial.distance import cdist\nfrom scipy.optimize import minimize\nimport matplotlib.pyplot as plt\n\n\ncalc = NEP('C_2022_NEP3.txt')\ntrain_data = load_nep('train.in')\ntest_data = load_nep('test.in')\ntrain_lat = np.concatenate([calc.get_property('latent', atoms) for atoms in train_data])"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "We use 10% testset to calculate the force error and the min distance to train data of each atoms\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def split_dataset(dataset, ratio=0.1):\n    indices = np.arange(len(dataset))\n    np.random.shuffle(indices)\n    num = int(len(dataset) * ratio)\n    return dataset[:num], dataset[num:]\nd1, d2 = split_dataset(test_data)\n\nd, e = [], []\nfor atoms in d1:\n    nep_forces = calc.get_forces(atoms)\n    dft_forces = atoms.info['forces']\n    error = np.sqrt(np.sum((nep_forces - dft_forces) ** 2, axis=1)).tolist()\n    lat = calc.get_property('latent', atoms)\n    mind = np.min(cdist(lat, train_lat), axis=1).tolist()\n    d.extend(mind)\n    e.extend(error)\nd = np.array(d)           # min distances to trainset\ne = np.array(e)           # errors between nep and dft"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Here we suppose $\\sigma = a + b \\cdot d + c \\cdot d^2$, and a, b, c > 0. \nMaximize $\\mathcal{N}(\\epsilon \\vert 0, \\sigma)$ is same to minimize \n$2 \\cdot ln(\\sigma) + d^2/\\sigma^2$ \n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "def loss(args):\n    a, b, c = args\n    sigma = np.abs(a + b * d + c * d ** 2)\n    l = np.sum(2 * np.log(sigma) + (e / sigma) ** 2)\n    return l\n\ncons = (\n    {'type': 'ineq', 'fun': lambda x: x - 0.0001},\n)\n\nx0 = np.array([1, 1, 1])\nres = minimize(loss, x0, constraints=cons, method='SLSQP')"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "results\n\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "d_, e_ = [], []\nfor atoms in d2[::10]:\n    nep_forces = calc.get_forces(atoms)\n    dft_forces = atoms.info['forces']\n    error = np.sqrt(np.sum((nep_forces - dft_forces) ** 2, axis=1)).tolist()\n\n    lat = calc.get_property('latent', atoms)\n    mind = np.min(cdist(lat, train_lat), axis=1).tolist()\n\n    d_.extend(mind)\n    e_.extend(error)\n\n\nd_ = np.array(d_)\ne_ = np.array(e_)\nsigma_ = res.x[0] + res.x[1] * d_ + res.x[2] * d_ ** 2\ns1 = len(e_[e_ < sigma_]) / len(e_)\ns2 = len(e_[e_ < 2 * sigma_]) / len(e_)\nplt.scatter(d_, e_, s=4)\n\ndd = np.linspace(0, max(d_) + 0.1, 100)\nsigma = res.x[0] + res.x[1] * dd + res.x[2] * dd ** 2\nplt.fill_between(dd, sigma, facecolor='blue', alpha=0.3,\n                 label=\"{:.2%}\".format(s1))\nplt.fill_between(dd, 2 * sigma, sigma, facecolor='yellow', alpha=0.3,\n                 label=\"{:.2%}\".format(s2 - s1))\nplt.legend()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.9"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}