Source code for pymatgen.matproj.snl

# coding: utf-8
# Copyright (c) Pymatgen Development Team.
# Distributed under the terms of the MIT License.

from __future__ import unicode_literals

"""
Classes and methods related to the Structure Notation Language (SNL)
"""

__author__ = 'Anubhav Jain, Shyue Ping Ong'
__credits__ = 'Dan Gunter'
__copyright__ = 'Copyright 2013, The Materials Project'
__version__ = '0.1'
__maintainer__ = 'Anubhav Jain'
__email__ = 'ajain@lbl.gov'
__date__ = 'Feb 11, 2013'

import sys
import re
import datetime
from collections import namedtuple
import json

from six.moves import map, cStringIO
from six import string_types

from monty.json import MontyDecoder, MontyEncoder
from monty.string import remove_non_ascii

from pymatgen.core.structure import Structure, Molecule
from pybtex.database.input import bibtex
from pybtex import errors


MAX_HNODE_SIZE = 64000  # maximum size (bytes) of SNL HistoryNode
MAX_DATA_SIZE = 256000  # maximum size (bytes) of SNL data field
MAX_HNODES = 100  # maximum number of HistoryNodes in SNL file
MAX_BIBTEX_CHARS = 20000  # maximum number of characters for BibTeX reference


[docs]def is_valid_bibtex(reference): """ Use pybtex to validate that a reference is in proper BibTeX format Args: reference: A String reference in BibTeX format. Returns: Boolean indicating if reference is valid bibtex. """ # str is necessary since pybtex seems to have an issue with unicode. The # filter expression removes all non-ASCII characters. sio = cStringIO(remove_non_ascii(reference)) parser = bibtex.Parser() errors.set_strict_mode(False) bib_data = parser.parse_stream(sio) return len(bib_data.entries) > 0
[docs]class HistoryNode(namedtuple('HistoryNode', ['name', 'url', 'description'])): """ A HistoryNode represents a step in the chain of events that lead to a Structure. HistoryNodes leave 'breadcrumbs' so that you can trace back how a Structure was created. For example, a HistoryNode might represent pulling a Structure from an external database such as the ICSD or CSD. Or, it might represent the application of a code (e.g. pymatgen) to the Structure, with a custom description of how that code was applied (e.g. a site removal Transformation was applied). A HistoryNode contains three fields: .. attribute:: name The name of a code or resource that this Structure encountered in its history (String) .. attribute:: url The URL of that code/resource (String) .. attribute:: description A free-form description of how the code/resource is related to the Structure (dict). """
[docs] def as_dict(self): return {"name": self.name, "url": self.url, "description": self.description}
[docs] @staticmethod def from_dict(h_node): return HistoryNode(h_node['name'], h_node['url'], h_node['description'])
[docs] @staticmethod def parse_history_node(h_node): """ Parses a History Node object from either a dict or a tuple. Args: h_node: A dict with name/url/description fields or a 3-element tuple. Returns: History node. """ if isinstance(h_node, dict): return HistoryNode.from_dict(h_node) else: if len(h_node) != 3: raise ValueError("Invalid History node, " "should be dict or (name, version, " "description) tuple: {}".format(h_node)) return HistoryNode(h_node[0], h_node[1], h_node[2])
[docs]class Author(namedtuple('Author', ['name', 'email'])): """ An Author contains two fields: .. attribute:: name Name of author (String) .. attribute:: email Email of author (String) """ def __str__(self): """ String representation of an Author """ return '{} <{}>'.format(self.name, self.email)
[docs] def as_dict(self): return {"name": self.name, "email": self.email}
[docs] @staticmethod def from_dict(d): return Author(d['name'], d['email'])
[docs] @staticmethod def parse_author(author): """ Parses an Author object from either a String, dict, or tuple Args: author: A String formatted as "NAME <email@domain.com>", (name, email) tuple, or a dict with name and email keys. Returns: An Author object. """ if isinstance(author, string_types): # Regex looks for whitespace, (any name), whitespace, <, (email), # >, whitespace m = re.match(r'\s*(.*?)\s*<(.*?@.*?)>\s*', author) if not m or m.start() != 0 or m.end() != len(author): raise ValueError("Invalid author format! {}".format(author)) return Author(m.groups()[0], m.groups()[1]) elif isinstance(author, dict): return Author.from_dict(author) else: if len(author) != 2: raise ValueError("Invalid author, should be String or (name, " "email) tuple: {}".format(author)) return Author(author[0], author[1])
[docs]class StructureNL(object): """ The Structure Notation Language (SNL, pronounced 'snail') is container for a pymatgen Structure/Molecule object with some additional fields for enhanced provenance. It is meant to be imported/exported in a JSON file format with the following structure: - about - created_at - authors - projects - references - remarks - data - history - lattice (optional) - sites Args: struct_or_mol: A pymatgen.core.structure Structure/Molecule object authors: *List* of {"name":'', "email":''} dicts, *list* of Strings as 'John Doe <johndoe@gmail.com>', or a single String with commas separating authors projects: List of Strings ['Project A', 'Project B'] references: A String in BibTeX format remarks: List of Strings ['Remark A', 'Remark B'] data: A free form dict. Namespaced at the root level with an underscore, e.g. {"_materialsproject": <custom data>} history: List of dicts - [{'name':'', 'url':'', 'description':{}}] created_at: A datetime object """ def __init__(self, struct_or_mol, authors, projects=None, references='', remarks=None, data=None, history=None, created_at=None): # initialize root-level structure keys self.structure = struct_or_mol # turn authors into list of Author objects authors = authors.split(',')\ if isinstance(authors, string_types) else authors self.authors = [Author.parse_author(a) for a in authors] # turn projects into list of Strings projects = projects if projects else [] self.projects = [projects] \ if isinstance(projects, string_types) else projects # check that references are valid BibTeX if not isinstance(references, string_types): raise ValueError("Invalid format for SNL reference! Should be " "empty string or BibTeX string.") if references and not is_valid_bibtex(references): raise ValueError("Invalid format for SNL reference! Should be " "BibTeX string.") if len(references) > MAX_BIBTEX_CHARS: raise ValueError("The BibTeX string must be fewer than {} chars " ", you have {}" .format(MAX_BIBTEX_CHARS, len(references))) self.references = references # turn remarks into list of Strings remarks = remarks if remarks else [] self.remarks = [remarks] if isinstance(remarks, string_types) \ else remarks # check remarks limit for r in self.remarks: if len(r) > 140: raise ValueError("The remark exceeds the maximum size of" "140 characters: {}".format(r)) # check data limit self.data = data if data else {} if not sys.getsizeof(self.data) < MAX_DATA_SIZE: raise ValueError("The data dict exceeds the maximum size limit of" " {} bytes (you have {})" .format(MAX_DATA_SIZE, sys.getsizeof(data))) for k, v in self.data.items(): if not k.startswith("_"): raise ValueError("data must contain properly namespaced data " "with keys starting with an underscore. The " "key {} does not start with an underscore.", format(k)) # check for valid history nodes history = history if history else [] # initialize null fields if len(history) > MAX_HNODES: raise ValueError("A maximum of {} History nodes are supported, " "you have {}!".format(MAX_HNODES, len(history))) self.history = [HistoryNode.parse_history_node(h) for h in history] if not all([sys.getsizeof(h) < MAX_HNODE_SIZE for h in history]): raise ValueError("One or more history nodes exceeds the maximum " "size limit of {} bytes".format(MAX_HNODE_SIZE)) self.created_at = created_at if created_at \ else datetime.datetime.utcnow()
[docs] def as_dict(self): d = self.structure.as_dict() d["@module"] = self.__class__.__module__ d["@class"] = self.__class__.__name__ d["about"] = {"authors": [a.as_dict() for a in self.authors], "projects": self.projects, "references": self.references, "remarks": self.remarks, "history": [h.as_dict() for h in self.history], "created_at": json.loads(json.dumps(self.created_at, cls=MontyEncoder))} d["about"].update(json.loads(json.dumps(self.data, cls=MontyEncoder))) return d
[docs] @classmethod def from_dict(cls, d): a = d["about"] dec = MontyDecoder() created_at = dec.process_decoded(a.get("created_at")) data = {k: v for k, v in d["about"].items() if k.startswith("_")} data = dec.process_decoded(data) structure = Structure.from_dict(d) if "lattice" in d \ else Molecule.from_dict(d) return cls(structure, a["authors"], projects=a.get("projects", None), references=a.get("references", ""), remarks=a.get("remarks", None), data=data, history=a.get("history", None), created_at=created_at)
[docs] @classmethod def from_structures(cls, structures, authors, projects=None, references='', remarks=None, data=None, histories=None, created_at=None): """ A convenience method for getting a list of StructureNL objects by specifying structures and metadata separately. Some of the metadata is applied to all of the structures for ease of use. Args: structures: A list of Structure objects authors: *List* of {"name":'', "email":''} dicts, *list* of Strings as 'John Doe <johndoe@gmail.com>', or a single String with commas separating authors projects: List of Strings ['Project A', 'Project B']. This applies to all structures. references: A String in BibTeX format. Again, this applies to all structures. remarks: List of Strings ['Remark A', 'Remark B'] data: A list of free form dict. Namespaced at the root level with an underscore, e.g. {"_materialsproject":<custom data>} . The length of data should be the same as the list of structures if not None. histories: List of list of dicts - [[{'name':'', 'url':'', 'description':{}}], ...] The length of histories should be the same as the list of structures if not None. created_at: A datetime object """ data = [{}] * len(structures) if data is None else data histories = [[]] * len(structures) if histories is None else \ histories snl_list = [] for i, struct in enumerate(structures): snl = StructureNL(struct, authors, projects=projects, references=references, remarks=remarks, data=data[i], history=histories[i], created_at=created_at) snl_list.append(snl) return snl_list
def __str__(self): return "\n".join(["{}\n{}".format(k, getattr(self, k)) for k in ("structure", "authors", "projects", "references", "remarks", "data", "history", "created_at")]) def __eq__(self, other): return all(map(lambda n: getattr(self, n) == getattr(other, n), ("structure", "authors", "projects", "references", "remarks", "data", "history", "created_at"))) def __ne__(self, other): return not self.__eq__(other)