Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEATURE REQUEST] SkipAtom compositional featurizer #854

Open
sgbaird opened this issue Aug 6, 2022 · 0 comments
Open

[FEATURE REQUEST] SkipAtom compositional featurizer #854

sgbaird opened this issue Aug 6, 2022 · 0 comments

Comments

@sgbaird
Copy link

sgbaird commented Aug 6, 2022

https://github.com/lantunes/skipatom

Intended usage:

# Proposed usage: "skipatom" is the preset this feature request asks for.
from matminer.featurizers.composition.composite import ElementProperty
from pymatgen.core.composition import Composition

# Build the pymatgen compositions to featurize from their formula strings.
formulas = ("Al2O3", "CeCoGe3")
compositions = [Composition(formula) for formula in formulas]

# Construct the featurizer from the preset and featurize every composition.
ep = ElementProperty.from_preset("skipatom")
comp_fingerprints = ep.featurize_many(compositions)

I think the following would be a good example to follow:

class MatscholarElementData(AbstractData):
    """
    Data source exposing per-element word-embedding vectors.

    The embeddings were generated using NLP + neural-network techniques on
    more than 3 million scientific abstracts. The values served by this class
    are simply the learned representations of the elements, taken from:

    Tshitoyan, V., Dagdelen, J., Weston, L. et al. Unsupervised word embeddings
    capture latent knowledge from materials science literature. Nature 571,
    95–98 (2019). https://doi.org/10.1038/s41586-019-1335-8

    Attributes are set in ``__init__``:
        prop_names: the labels "embedding 1" ... "embedding 200".
        all_element_data: maps each key of the JSON file to a dict of
            ``{prop_name: value}`` built from that key's embedding.
    """

    def __init__(self):
        # The embeddings ship as a JSON file located relative to module_dir.
        embedding_path = os.path.join(module_dir, "data_files/matscholar_els.json")
        with open(embedding_path) as handle:
            raw_vectors = json.load(handle)

        self.prop_names = [f"embedding {idx}" for idx in range(1, 201)]
        # zip() pairs each label with the vector component at the same position.
        self.all_element_data = {
            symbol: dict(zip(self.prop_names, vector))
            for symbol, vector in raw_vectors.items()
        }

    def get_elemental_property(self, elem, property_name):
        """
        Look up one embedding component for one element.

        Args:
            elem: element identifier; it is converted with ``str()`` before
                the lookup.
            property_name: one of the labels in ``self.prop_names``.

        Returns:
            The stored value for that element and label.

        Raises:
            KeyError: if the element or the label is not present.
        """
        per_element = self.all_element_data[str(elem)]
        return per_element[property_name]

Followed by the appropriate incorporation into:

class ElementProperty(BaseFeaturizer):
    """
    Class to calculate elemental property attributes.

    To initialize quickly, use the from_preset() method.

    Features: Based on the statistics of the data_source chosen, computed
    by element stoichiometry. The format generally is:

    "{data source} {statistic} {property}"

    For example:

    "PymatgenData range X"  # Range of electronegativity from Pymatgen data

    For a list of all statistics, see the PropertyStats documentation; for a
    list of all attributes available for a given data_source, see the
    documentation for the data sources (e.g., PymatgenData, MagpieData,
    MatscholarElementData, etc.).

    Args:
        data_source (AbstractData or str): source from which to retrieve
            element property data (or use str for preset: "pymatgen",
            "magpie", "deml", "matscholar_el", or "megnet_el")
        features (list of strings): List of elemental properties to use
            (these must be supported by data_source)
        stats (list of strings): a list of weighted statistics to compute for
            each property (see PropertyStats for available stats)
    """

    def __init__(self, data_source, features, stats):
        # Recognised strings are replaced by a fresh instance of the matching
        # data class; anything else is stored unchanged.
        preset_sources = {
            "pymatgen": PymatgenData,
            "magpie": MagpieData,
            "deml": DemlData,
            "matscholar_el": MatscholarElementData,
            "megnet_el": MEGNetElementData,
        }
        if isinstance(data_source, str) and data_source in preset_sources:
            data_source = preset_sources[data_source]()
        self.data_source = data_source

        self.features = features
        self.stats = stats
        # Initialize stats computer
        self.pstats = PropertyStats()

    @classmethod
    def from_preset(cls, preset_name):
        """
        Return ElementProperty from a preset string

        Args:
            preset_name: (str) can be one of "magpie", "deml", "matminer",
                "matscholar_el", or "megnet_el".

        Returns:
            ElementProperty based on the preset name.

        Raises:
            ValueError: if preset_name is not one of the names listed above.
        """
        if preset_name == "magpie":
            data_source = "magpie"
            features = [
                "Number",
                "MendeleevNumber",
                "AtomicWeight",
                "MeltingT",
                "Column",
                "Row",
                "CovalentRadius",
                "Electronegativity",
                "NsValence",
                "NpValence",
                "NdValence",
                "NfValence",
                "NValence",
                "NsUnfilled",
                "NpUnfilled",
                "NdUnfilled",
                "NfUnfilled",
                "NUnfilled",
                "GSvolume_pa",
                "GSbandgap",
                "GSmagmom",
                "SpaceGroupNumber",
            ]
            stats = ["minimum", "maximum", "range", "mean", "avg_dev", "mode"]
        elif preset_name == "deml":
            data_source = "deml"
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = [
                "atom_num",
                "atom_mass",
                "row_num",
                "col_num",
                "atom_radius",
                "molar_vol",
                "heat_fusion",
                "melting_point",
                "boiling_point",
                "heat_cap",
                "first_ioniz",
                "electronegativity",
                "electric_pol",
                "GGAU_Etot",
                "mus_fere",
                "FERE correction",
            ]
        elif preset_name == "matminer":
            data_source = "pymatgen"
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = [
                "X",
                "row",
                "group",
                "block",
                "atomic_mass",
                "atomic_radius",
                "mendeleev_no",
                "electrical_resistivity",
                "velocity_of_sound",
                "thermal_conductivity",
                "melting_point",
                "bulk_modulus",
                "coefficient_of_linear_thermal_expansion",
            ]
        elif preset_name == "matscholar_el":
            # Hand the instance itself to the constructor so the data source
            # is built once instead of once here and again in __init__.
            data_source = MatscholarElementData()
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = data_source.prop_names
        elif preset_name == "megnet_el":
            # Same single-construction approach as for "matscholar_el".
            data_source = MEGNetElementData()
            stats = ["minimum", "maximum", "range", "mean", "std_dev"]
            features = data_source.prop_names
        else:
            raise ValueError(
                "Invalid preset_name specified! "
                f"Got {preset_name!r}; expected one of: "
                '"magpie", "deml", "matminer", "matscholar_el", "megnet_el".'
            )

        return cls(data_source, features, stats)

    def featurize(self, comp):
        """
        Get elemental property attributes

        Args:
            comp: Pymatgen composition object

        Returns:
            all_attributes: Specified property statistics of features, ordered
                feature-major then statistic (same order as feature_labels()).
        """
        all_attributes = []

        # Get the element names and fractions
        elements, fractions = zip(*comp.element_composition.items())

        for attr in self.features:
            elem_data = [self.data_source.get_elemental_property(e, attr) for e in elements]
            all_attributes.extend(
                self.pstats.calc_stat(elem_data, stat, fractions) for stat in self.stats
            )

        return all_attributes

    def feature_labels(self):
        """
        Return one label per value produced by featurize().

        Returns:
            List of "{data source class name} {statistic} {property}" strings,
            ordered feature-major then statistic.
        """
        # The class name does not change between iterations, so read it once.
        src = self.data_source.__class__.__name__
        return [f"{src} {stat} {attr}" for attr in self.features for stat in self.stats]

    def citations(self):
        """
        Return BibTeX citations for the data source in use.

        Returns:
            List of BibTeX strings; empty when the data source class is not
            one of the classes handled below.
        """
        src = self.data_source.__class__.__name__
        if src == "MagpieData":
            citation = [
                "@article{ward_agrawal_choudary_wolverton_2016, title={A general-purpose "
                "machine learning framework for predicting properties of inorganic materials}, "
                "volume={2}, DOI={10.1038/npjcompumats.2016.28}, number={1}, journal={npj "
                "Computational Materials}, author={Ward, Logan and Agrawal, Ankit and Choudhary, "
                "Alok and Wolverton, Christopher}, year={2016}}"
            ]
        elif src == "DemlData":
            citation = [
                "@article{deml_ohayre_wolverton_stevanovic_2016, title={Predicting density "
                "functional theory total energies and enthalpies of formation of metal-nonmetal "
                "compounds by linear regression}, volume={47}, DOI={10.1002/chin.201644254}, "
                "number={44}, journal={ChemInform}, author={Deml, Ann M. and Ohayre, Ryan and "
                "Wolverton, Chris and Stevanovic, Vladan}, year={2016}}"
            ]
        elif src == "PymatgenData":
            citation = [
                "@article{Ong2013, author = {Ong, Shyue Ping and Richards, William Davidson and Jain, Anubhav and Hautier, "
                "Geoffroy and Kocher, Michael and Cholia, Shreyas and Gunter, Dan and Chevrier, Vincent L. and Persson, "
                "Kristin A. and Ceder, Gerbrand}, doi = {10.1016/j.commatsci.2012.10.028}, issn = {09270256}, "
                "journal = {Computational Materials Science}, month = {feb}, pages = {314--319}, "
                "publisher = {Elsevier B.V.}, title = {{Python Materials Genomics (pymatgen): A robust, open-source python "
                "library for materials analysis}}, url = {http://linkinghub.elsevier.com/retrieve/pii/S0927025612006295}, "
                "volume = {68}, year = {2013} } "
            ]
        elif src == "MatscholarElementData":
            # Bibliographic details as given in the MatscholarElementData docstring.
            citation = [
                "@article{tshitoyan_dagdelen_weston_2019, title={Unsupervised word embeddings "
                "capture latent knowledge from materials science literature}, volume={571}, "
                "DOI={10.1038/s41586-019-1335-8}, journal={Nature}, author={Tshitoyan, V. and "
                "Dagdelen, J. and Weston, L. and others}, pages={95--98}, year={2019}}"
            ]
        elif src == "MEGNetElementData":
            # TODO: Cite MEGNet publication (not preprint) once released!
            citation = [
                "@ARTICLE{2018arXiv181205055C,"
                "author = {{Chen}, Chi and {Ye}, Weike and {Zuo}, Yunxing and {Zheng}, Chen and {Ong}, Shyue Ping},"
                "title = '{Graph Networks as a Universal Machine Learning Framework for Molecules and Crystals}',"
                "journal = {arXiv e-prints},"
                "keywords = {Condensed Matter - Materials Science, Physics - Computational Physics},"
                "year = '2018',"
                "month = 'Dec',"
                "eid = {arXiv:1812.05055},"
                "pages = {arXiv:1812.05055},"
                "archivePrefix = {arXiv},"
                "eprint = {1812.05055},"
                "primaryClass = {cond-mat.mtrl-sci},"
                r"adsurl = {https://ui.adsabs.harvard.edu/\#abs/2018arXiv181205055C},"
                "adsnote = {Provided by the SAO/NASA Astrophysics Data System}}"
            ]
        else:
            citation = []
        return citation

    def implementors(self):
        """Return the names of the people credited with this featurizer."""
        return ["Jiming Chen", "Logan Ward", "Anubhav Jain", "Alex Dunn"]

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant