Source code for dscribe.descriptors.elementaldistribution

# -*- coding: utf-8 -*-
"""Copyright 2019 DScribe developers

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import math
import numpy as np
from scipy.special import erf
from scipy.sparse import lil_matrix
from dscribe.descriptors import Descriptor


[docs]class ElementalDistribution(Descriptor):
    """Represents a generic distribution on any given grid for any given
    properties. Can create both continuos and discrete distributions.

    Continuous distributions require a standard deviation and the number of
    sampling points. You can also specify the minimum and maximum values for
    the axis. If these are not specified, a limit is selected based
    automatically on the values with:

        min = values.min() - 3*std
        max = values.max() + 3*std

    Discrete distributions are assumed to be integer values, and you only need
    to specify the values.
    """

    def __init__(
        self,
        properties,
        flatten=True,
        sparse=True,
    ):
        """
        Args:
            properties(dict): Contains a description of the elemental property
                for which a distribution is created. Should contain a dictionary of
                the following form:

                properties={
                    "property_name": {
                        "type": "continuous"
                        "min": <Distribution minimum value>
                        "max": <Distribution maximum value>
                        "std": <Distribution standard deviation>
                        "n": <Number of discrete samples from distribution>
                        "values": {
                            "H": <Value for hydrogen>
                            ...
                        }
                    "property_name2": {
                        "type": "discrete"
                        "values": {
                            "H": <Value for hydrogen>
                            ...
                        }
                    }
                    }
                }
            flatten(bool): Whether to flatten out the result.
            sparse (bool): Whether the output should be a sparse matrix or a
                dense numpy array.
        """
        # Check that the given properties are valid
        for prop_name, prop_grid in properties.items():
            dist_type = prop_grid.get("type")
            valid_dist_types = set(["continuous", "discrete"])
            if dist_type not in valid_dist_types:
                raise ValueError(
                    "Please specify the distribution type. Valid options are: {}".format(
                        valid_dist_types
                    )
                )
            i_min = prop_grid.get("min")
            i_max = prop_grid.get("max")
            std = prop_grid.get("std")
            n = prop_grid.get("n")
            values = prop_grid.get("values")

            if values is None:
                raise ValueError(
                    "Please provide the property values, i.e. a dictionary that"
                    " maps an atomic element symbol to a property value."
                )

            values = np.array(list(values.values()))

            if dist_type == "continuous":
                true_min = values.min()
                true_max = values.max()
                if i_min is None:
                    i_min = true_min - 3 * std
                    prop_grid["min"] = i_min
                if i_max is None:
                    i_max = true_max + 3 * std
                    prop_grid["max"] = i_max
                if i_min >= i_max:
                    raise ValueError(
                        "Minimum value for '{}' cannot be larger or equal to maximum "
                        "value.".format(prop_name)
                    )
                if std <= 0:
                    raise ValueError(
                        "The standard deviation must be a larger than zero."
                    )
                if n <= 0:
                    raise ValueError(
                        "The number of grid points must be a non-negative " "integer."
                    )
                if true_min < prop_grid["min"]:
                    raise ValueError(
                        "Property value is outside the specified minimum value."
                    )
                if true_max > prop_grid["max"]:
                    raise ValueError(
                        "Property value is outside the specified maximum value."
                    )
            elif dist_type == "discrete":

                # Check that all values are integer
                if not all(np.issubdtype(item, np.integer) for item in values):
                    raise ValueError(
                        "Not all the values given for property '{}' are integer "
                        "numbers.".format(prop_name)
                    )
                i_min = values.min()
                i_max = values.max()
                prop_grid["min"] = i_min
                prop_grid["max"] = i_max
                prop_grid["n"] = i_max - i_min + 1

        self.properties = properties

[docs]    def get_number_of_features(self):
        """Used to inquire the final number of features that this descriptor
        will have.

        Returns:
            int: Number of features for this descriptor.
        """
        n_features = 0
        for prop in self.properties.values():
            n_features += prop["n"]

        return n_features

[docs]    def get_axis(self, property_name):
        """Used to return the used x-axis for the given property.

        Args:
            property_name(str): The property name that was used in the
            constructor.

        Returns:
            np.ndarray: An array of x-axis values.
        """
        prop = self.properties[property_name]
        minimum = prop["min"]
        maximum = prop["max"]
        dist_type = prop["type"]
        if dist_type == "continuous":
            n = prop["n"]
            x = np.linspace(minimum, maximum, n)
        elif dist_type == "discrete":
            x = np.arange(minimum, maximum + 1)
        return x

[docs]    def create(self, system):
        """
        Args:
            system (:class:`ase.Atoms` | :class:`.System`): Input system.

        Returns:
            scipy.sparse.lil_matrix: The concatenated distributions of the
                specified properties in a sparse array.
        """
        occurrence = self.get_element_occurrence(system)
        weights = np.array(list(occurrence.values()))
        n_features = self.get_number_of_features()
        distribution = lil_matrix((1, n_features), dtype=np.float32)

        index = 0
        for prop in self.properties.values():
            dist_type = prop["type"]
            if dist_type == "continuous":
                n = prop["n"]
                minimum = prop["min"]
                maximum = prop["max"]
                std = prop["std"]
                values = prop["values"]
                centers = np.array([values[x] for x in occurrence.keys()])
                pdf = self.gaussian_sum(centers, weights, minimum, maximum, std, n)
                distribution[0, index : index + n] += pdf
                index += n
            elif dist_type == "discrete":
                n = prop["n"]
                values = prop["values"]
                hist = np.zeros((n))
                minimum = prop["min"]
                for element, occ in occurrence.items():
                    value = values[element]
                    hist_index = value - minimum
                    hist[hist_index] = occ
                distribution[0, index : index + n] += hist
                index += n

        return distribution

[docs]    def gaussian_sum(self, centers, weights, minimum, maximum, std, n):
        """Calculates a discrete version of a sum of Gaussian distributions.

        The calculation is done through the cumulative distribution function
        that is better at keeping the integral of the probability function
        constant with coarser grids.

        The values are normalized by dividing with the maximum value of a
        gaussian with the given standard deviation.

        Args:
            centers (1D np.ndarray): The means of the gaussians.
            weights (1D np.ndarray): The weights for the gaussians.
            minimum (float): The minimum grid value
            maximum (float): The maximum grid value
            std (float): Standard deviation of the gaussian
            n (int): Number of grid points
            settings (dict): The grid settings. A dictionary
                containing the following information:

        Returns:
            Value of the gaussian sums on the given grid.
        """
        max_val = 1 / (std * math.sqrt(2 * math.pi))

        dx = (maximum - minimum) / (n - 1)
        x = np.linspace(minimum - dx / 2, maximum + dx / 2, n + 1)
        pos = x[np.newaxis, :] - centers[:, np.newaxis]
        y = weights[:, np.newaxis] * 1 / 2 * (1 + erf(pos / (std * np.sqrt(2))))
        f = np.sum(y, axis=0)
        f /= max_val
        f_rolled = np.roll(f, -1)
        pdf = (f_rolled - f)[0:-1] / dx  # PDF is the derivative of CDF

        return pdf

[docs]    def get_element_occurrence(self, system):
        """Calculate the count of each atomic element in the given system.

        Args:
            system (ase.Atoms): The atomic system.

        Returns:
            1D ndarray: The counts for each element in a list where the index
            of atomic number x is self.atomic_number_to_index[x]
        """
        symbols = system.get_chemical_symbols()
        unique, counts = np.unique(symbols, return_counts=True)
        occurrence = dict(zip(unique, counts))

        return occurrence