Source code for dscribe.descriptors.elementaldistribution

# -*- coding: utf-8 -*-
"""Copyright 2019 DScribe developers

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import math
import numpy as np
from scipy.special import erf
from scipy.sparse import lil_matrix
from dscribe.descriptors import Descriptor


[docs]class ElementalDistribution(Descriptor): """Represents a generic distribution on any given grid for any given properties. Can create both continuos and discrete distributions. Continuous distributions require a standard deviation and the number of sampling points. You can also specify the minimum and maximum values for the axis. If these are not specified, a limit is selected based automatically on the values with: min = values.min() - 3*std max = values.max() + 3*std Discrete distributions are assumed to be integer values, and you only need to specify the values. """ def __init__( self, properties, flatten=True, sparse=True, ): """ Args: properties(dict): Contains a description of the elemental property for which a distribution is created. Should contain a dictionary of the following form: properties={ "property_name": { "type": "continuous" "min": <Distribution minimum value> "max": <Distribution maximum value> "std": <Distribution standard deviation> "n": <Number of discrete samples from distribution> "values": { "H": <Value for hydrogen> ... } "property_name2": { "type": "discrete" "values": { "H": <Value for hydrogen> ... } } } } flatten(bool): Whether to flatten out the result. sparse (bool): Whether the output should be a sparse matrix or a dense numpy array. """ # Check that the given properties are valid for prop_name, prop_grid in properties.items(): dist_type = prop_grid.get("type") valid_dist_types = set(["continuous", "discrete"]) if dist_type not in valid_dist_types: raise ValueError( "Please specify the distribution type. Valid options are: {}" .format(valid_dist_types) ) i_min = prop_grid.get("min") i_max = prop_grid.get("max") std = prop_grid.get("std") n = prop_grid.get("n") values = prop_grid.get("values") if values is None: raise ValueError( "Please provide the property values, i.e. a dictionary that" " maps an atomic element symbol to a property value." ) values = np.array(list(values.values())) if dist_type == "continuous": true_min = values.min() true_max = values.max() if i_min is None: i_min = true_min - 3*std prop_grid["min"] = i_min if i_max is None: i_max = true_max + 3*std prop_grid["max"] = i_max if i_min >= i_max: raise ValueError( "Minimum value for '{}' cannot be larger or equal to maximum " "value.".format(prop_name) ) if std <= 0: raise ValueError( "The standard deviation must be a larger than zero." ) if n <= 0: raise ValueError( "The number of grid points must be a non-negative " "integer." ) if true_min < prop_grid["min"]: raise ValueError( "Property value is outside the specified minimum value." ) if true_max > prop_grid["max"]: raise ValueError( "Property value is outside the specified maximum value." ) elif dist_type == "discrete": # Check that all values are integer if not all(np.issubdtype(item, np.integer) for item in values): raise ValueError( "Not all the values given for property '{}' are integer " "numbers.".format(prop_name) ) i_min = values.min() i_max = values.max() prop_grid["min"] = i_min prop_grid["max"] = i_max prop_grid["n"] = i_max - i_min + 1 self.properties = properties
[docs] def get_number_of_features(self): """Used to inquire the final number of features that this descriptor will have. Returns: int: Number of features for this descriptor. """ n_features = 0 for prop in self.properties.values(): n_features += prop["n"] return n_features
[docs] def get_axis(self, property_name): """Used to return the used x-axis for the given property. Args: property_name(str): The property name that was used in the constructor. Returns: np.ndarray: An array of x-axis values. """ prop = self.properties[property_name] minimum = prop["min"] maximum = prop["max"] dist_type = prop["type"] if dist_type == "continuous": n = prop["n"] x = np.linspace(minimum, maximum, n) elif dist_type == "discrete": x = np.arange(minimum, maximum+1) return x
[docs] def create(self, system): """ Args: system (:class:`ase.Atoms` | :class:`.System`): Input system. Returns: scipy.sparse.lil_matrix: The concatenated distributions of the specified properties in a sparse array. """ occurrence = self.get_element_occurrence(system) weights = np.array(list(occurrence.values())) n_features = self.get_number_of_features() distribution = lil_matrix((1, n_features), dtype=np.float32) index = 0 for prop in self.properties.values(): dist_type = prop["type"] if dist_type == "continuous": n = prop["n"] minimum = prop["min"] maximum = prop["max"] std = prop["std"] values = prop["values"] centers = np.array([values[x] for x in occurrence.keys()]) pdf = self.gaussian_sum(centers, weights, minimum, maximum, std, n) distribution[0, index:index+n] += pdf index += n elif dist_type == "discrete": n = prop["n"] values = prop["values"] hist = np.zeros((n)) minimum = prop["min"] for element, occ in occurrence.items(): value = values[element] hist_index = value - minimum hist[hist_index] = occ distribution[0, index:index+n] += hist index += n return distribution
[docs] def gaussian_sum(self, centers, weights, minimum, maximum, std, n): """Calculates a discrete version of a sum of Gaussian distributions. The calculation is done through the cumulative distribution function that is better at keeping the integral of the probability function constant with coarser grids. The values are normalized by dividing with the maximum value of a gaussian with the given standard deviation. Args: centers (1D np.ndarray): The means of the gaussians. weights (1D np.ndarray): The weights for the gaussians. minimum (float): The minimum grid value maximum (float): The maximum grid value std (float): Standard deviation of the gaussian n (int): Number of grid points settings (dict): The grid settings. A dictionary containing the following information: Returns: Value of the gaussian sums on the given grid. """ max_val = 1/(std*math.sqrt(2*math.pi)) dx = (maximum - minimum)/(n-1) x = np.linspace(minimum-dx/2, maximum+dx/2, n+1) pos = x[np.newaxis, :] - centers[:, np.newaxis] y = weights[:, np.newaxis]*1/2*(1 + erf(pos/(std*np.sqrt(2)))) f = np.sum(y, axis=0) f /= max_val f_rolled = np.roll(f, -1) pdf = (f_rolled - f)[0:-1]/dx # PDF is the derivative of CDF return pdf
[docs] def get_element_occurrence(self, system): """Calculate the count of each atomic element in the given system. Args: system (ase.Atoms): The atomic system. Returns: 1D ndarray: The counts for each element in a list where the index of atomic number x is self.atomic_number_to_index[x] """ symbols = system.get_chemical_symbols() unique, counts = np.unique(symbols, return_counts=True) occurrence = dict(zip(unique, counts)) return occurrence