# -*- coding: utf-8 -*-
from typing import Dict, List, NamedTuple, Optional, Tuple, Union
import numpy as np
import pandas as pd
from pandas import Categorical, DataFrame, IntervalIndex, Series
from pandas.core.groupby import DataFrameGroupBy
from pymatgen import PeriodicSite, Structure
from neighbormodels.structure import label_subspecies
# One neighbor record: (neighboring site, separation distance, neighbor's site index).
# Elsewhere in this module element [1] is stored as ``distance_ij`` and element [2]
# is used to index back into the structure.
Neighbor = Tuple[PeriodicSite, float, int]
# The neighbor records collected for a single site.
SiteNeighbors = List[Optional[Neighbor]]
# One ``SiteNeighbors`` list per site; used to annotate the value returned by
# ``Structure.get_all_neighbors``.
AllNeighborDistances = List[SiteNeighbors]
# Column-oriented pair data: each string key maps to a list that grows by one entry
# per neighbor pair (keys used in this module: "i", "j", "subspecies_i",
# "subspecies_j", "distance_ij").
NeighborDistances = Dict[str, Union[List[str], List[float], List[int]]]
class NeighborData(NamedTuple):
    """Bundle of results returned by ``count_neighbors``.

    The three fields are filled, in order, with the neighbor-count table, the
    table of unique sublattice pairs, and the structure the counts were
    computed from.
    """

    # Neighbor counts aggregated over site-index pairs and distance bins.
    neighbor_count: DataFrame
    # Unique (subspecies_i, subspecies_j, distance_bin) combinations with a rank.
    sublattice_pairs: DataFrame
    # The (copied, subspecie-labeled) structure used for the calculation.
    structure: Structure
def count_neighbors(cell_structure: Structure, r: float) -> NeighborData:
    """Count the neighbors of every site, grouped by site-index pair and distance.

    The calculation runs as a sequence of steps: copy/label the structure,
    collect all pairwise neighbor distances within ``r``, derive distance bins
    from the unique distances, group the pairs by those bins, count the members
    of each group, and finally sort the counts.

    :param cell_structure: A pymatgen ``Structure`` object.
    :param r: Radius of the sphere used to search for neighbors.
    :return: A named tuple with three fields:
        ``neighbor_count``
            A pandas ``DataFrame`` of neighbor counts aggregated over
            site-index pairs and distance bins, sorted by site index i, then
            distance bin, then site index j.
        ``sublattice_pairs``
            A pandas ``DataFrame`` holding the unique combinations of
            ``subspecies_i``, ``subspecies_j`` and ``distance_bin`` found in
            ``neighbor_count``, together with a ``rank`` column.
        ``structure``
            The copy of ``cell_structure`` (with subspecie labels) that the
            counts were computed from.
    """
    # Work on a labeled copy so the caller's structure is left untouched.
    labeled_structure = add_subspecie_labels_if_missing(cell_structure=cell_structure)

    pair_distances = get_neighbor_distances_data_frame(
        cell_structure=labeled_structure, r=r
    )
    distance_bins = define_bins_to_group_and_sort_by_distance(pair_distances)

    grouped_pairs = group_site_index_pairs_by_distance(
        pair_distances, distance_bins_df=distance_bins
    )
    pair_counts = count_neighbors_within_distance_groups(grouped_pairs)
    sorted_counts = sort_neighbors_by_site_index_i(pair_counts)

    unique_pairs = sort_and_rank_unique_sublattice_pairs(sorted_counts)

    return NeighborData(
        neighbor_count=sorted_counts,
        sublattice_pairs=unique_pairs,
        structure=labeled_structure,
    )
def sort_and_rank_unique_sublattice_pairs(data_frame: DataFrame) -> DataFrame:
    """Extract, sort, and rank the unique subspecies-pair / distance-bin rows.

    :param data_frame: A pandas ``DataFrame`` containing at least the columns
        ``subspecies_i``, ``subspecies_j`` and ``distance_bin``.
    :return: A pandas ``DataFrame`` with one row per unique combination of those
        three columns, sorted by them, plus a ``rank`` column that numbers the
        rows of each (``subspecies_i``, ``subspecies_j``) pair starting at 0.
    """
    pair_keys = ["subspecies_i", "subspecies_j"]
    all_keys = [*pair_keys, "distance_bin"]

    # Only the key columns are kept, so de-duplicating on every remaining
    # column is the same as de-duplicating on the keys.
    unique_rows = data_frame[all_keys].drop_duplicates().sort_values(by=all_keys)

    # The rows are already sorted by distance bin within each pair, so the
    # running count within a pair is that row's rank.
    rank_within_pair = unique_rows.groupby(pair_keys).cumcount()

    return unique_rows.assign(rank=rank_within_pair).reset_index(drop=True)
def sort_neighbors_by_site_index_i(neighbor_count_df: DataFrame) -> DataFrame:
    """Order the neighbor counts by site index i, distance bin, then site index j.

    :param neighbor_count_df: A data frame of neighbor counts aggregated over
        site-index pairs and separation distances; it must contain the columns
        ``i``, ``distance_bin`` and ``j``.
    :return: A new pandas ``DataFrame`` with the same rows, sorted by ``i``, then
        ``distance_bin``, then ``j``, and re-indexed from 0.
    """
    sort_keys = ["i", "distance_bin", "j"]
    ordered = neighbor_count_df.sort_values(by=sort_keys)
    # Discard the pre-sort index so the rows are numbered in their new order.
    return ordered.reset_index(drop=True)
def count_neighbors_within_distance_groups(
    grouped_distances: DataFrameGroupBy,
) -> DataFrame:
    """Count the rows in each group of same-distance site-index pairs.

    :param grouped_distances: A data frame grouped over site-index pairs, subspecies
        pairs, and bin intervals. Each group must expose a ``distance_ij`` column.
    :return: A pandas ``DataFrame`` with the group keys as columns plus a column
        ``n`` holding the number of non-null ``distance_ij`` entries per group.
    """

    def _count_distances(group: DataFrame):
        # The count is passed through ``to_numeric`` with an integer downcast.
        return pd.to_numeric(arg=group["distance_ij"].count(), downcast="integer")

    counts_per_group = grouped_distances.apply(_count_distances)
    # Name the values "n" and turn the group keys back into regular columns.
    return counts_per_group.rename("n").reset_index()
def group_site_index_pairs_by_distance(
    neighbor_distances_df: DataFrame, distance_bins_df: DataFrame
) -> DataFrameGroupBy:
    """Group the neighbor pairs by site indices, subspecies, and distance bin.

    :param neighbor_distances_df: A pandas ``DataFrame`` containing all pairwise
        neighbor distances (columns ``i``, ``j``, ``subspecies_i``,
        ``subspecies_j`` and ``distance_ij``).
    :param distance_bins_df: A pandas ``DataFrame`` of neighbor distances mapped to
        unique bin intervals; only its index is used, as the set of bins.
    :return: A data frame grouped over site-index pairs, subspecies pairs, and
        bin intervals.
    """
    # Map every distance onto the interval that contains it; grouping on the
    # interval rather than the raw float avoids exact float comparisons.
    distance_bin: Series = pd.cut(
        x=neighbor_distances_df["distance_ij"], bins=distance_bins_df.index
    )
    distance_bin = distance_bin.rename("distance_bin")

    group_keys = ["i", "j", "subspecies_i", "subspecies_j", distance_bin]
    return neighbor_distances_df.groupby(group_keys)
def define_bins_to_group_and_sort_by_distance(
    neighbor_distances_df: DataFrame,
) -> DataFrame:
    """Build the table of distance bins used to group and sort neighbor pairs.

    :param neighbor_distances_df: A pandas ``DataFrame`` of pairwise neighbor
        distances; only its ``distance_ij`` column is read.
    :return: A pandas ``DataFrame`` indexed by the bin intervals, with an ordered
        categorical column ``distance_bin`` (the intervals) and an ordered
        categorical column ``distance_ij`` (the unique distances).
    """
    distances: np.ndarray = find_unique_distances(
        distance_ij=neighbor_distances_df["distance_ij"]
    )
    intervals: IntervalIndex = define_bin_intervals(unique_distances=distances)

    columns = {
        "distance_bin": Categorical(values=intervals, ordered=True),
        "distance_ij": Categorical(values=distances, ordered=True),
    }
    return DataFrame(data=columns, index=intervals)
def find_unique_distances(distance_ij: Series) -> np.ndarray:
    """Reduce the pairwise distances to the distinct values that define groups.

    Values that ``numpy.isclose`` considers equal to their predecessor in sorted
    order are treated as duplicates and dropped.

    :param distance_ij: A pandas ``Series`` of pairwise neighbor distances.
    :return: A sorted array of unique neighbor distances.
    """
    sorted_distances: np.ndarray = np.sort(distance_ij.unique())
    first, rest = sorted_distances[:1], sorted_distances[1:]

    # Compare every value (except the first) with the one just before it.
    close_to_previous: np.ndarray = np.isclose(rest, sorted_distances[:-1])

    # The smallest value is always kept; later ones only if they moved far enough.
    return np.concatenate((first, rest[~close_to_previous]))
def define_bin_intervals(unique_distances: np.ndarray) -> IntervalIndex:
    """Construct the bin intervals used to group over neighbor distances.

    Binning provides a robust way to group on a float-valued variable. Each
    distance becomes a bin center; bin edges sit halfway between consecutive
    centers, with 0 prepended as a virtual first center. The final upper edge
    lies above the largest distance by half the last gap.

    :param unique_distances: An array of unique neighbor distances.
    :return: A pandas ``IntervalIndex`` whose intervals can be used to sort and
        group neighbor distances.
    """
    centers: np.ndarray = np.concatenate(([0], unique_distances))
    half_gaps: np.ndarray = (centers[1:] - centers[:-1]) / 2

    # Edge between each pair of consecutive centers.
    inner_edges = centers[:-1] + half_gaps
    # Close the last bin symmetrically around the largest distance.
    outer_edge = centers[-1:] + half_gaps[-1:]

    return IntervalIndex.from_breaks(breaks=np.concatenate([inner_edges, outer_edge]))
def get_neighbor_distances_data_frame(cell_structure: Structure, r: float) -> DataFrame:
    """Tabulate the pairwise neighbor distances for each atom in the unit cell,
    out to a distance ``r``.

    :param cell_structure: A pymatgen ``Structure`` object.
    :param r: Radius of the sphere used to search for neighbors.
    :return: A pandas ``DataFrame`` of pairwise neighbor distances, built from the
        data returned by ``extract_neighbor_distance_data``.
    """
    # Ask for the neighbor's site index as well, so pairs can be labeled (i, j).
    neighbors_per_site: AllNeighborDistances = cell_structure.get_all_neighbors(
        r=r, include_index=True
    )
    return DataFrame(
        data=extract_neighbor_distance_data(
            cell_structure=cell_structure, all_neighbors=neighbors_per_site
        )
    )
def append_site_i_neighbor_distance_data(
    site_i_index: int,
    site_i_neighbors: SiteNeighbors,
    cell_structure: Structure,
    neighbor_distances: NeighborDistances,
) -> None:
    """Helper function to append indices, species, and distances to the
    ``neighbor_distances`` dictionary.

    One entry is appended to each of the lists stored under the keys ``i``,
    ``j``, ``subspecies_i``, ``subspecies_j`` and ``distance_ij`` for every
    neighbor of site i. The dictionary is modified in place.

    :param site_i_index: Site index of first site in neighbor pair.
    :param site_i_neighbors: A list of site i's neighbors. Element ``[1]`` of each
        neighbor is read as the separation distance and element ``[2]`` as the
        neighbor's site index.
    :param cell_structure: The pymatgen ``Structure`` object that defines the crystal
        structure. Each site must carry a ``subspecie`` entry in its properties.
    :param neighbor_distances: A dictionary of site indices, site species, and neighbor
        distances for each pair.
    """
    # Nothing to record; also avoids touching ``cell_structure`` for sites that
    # have no neighbors within range.
    if not site_i_neighbors:
        return

    # Site i is the same for every pair, so look its label up only once.
    subspecies_i: str = cell_structure[site_i_index].properties["subspecie"]

    for site_j in site_i_neighbors:
        # Index-based access keeps this working for neighbor records that carry
        # additional trailing fields.
        site_j_index: int = site_j[2]
        subspecies_j: str = cell_structure[site_j_index].properties["subspecie"]

        neighbor_distances["i"].append(site_i_index)
        neighbor_distances["j"].append(site_j_index)
        neighbor_distances["subspecies_i"].append(subspecies_i)
        neighbor_distances["subspecies_j"].append(subspecies_j)
        neighbor_distances["distance_ij"].append(site_j[1])
def add_subspecie_labels_if_missing(cell_structure: Structure) -> Structure:
    """Return a copy of ``cell_structure`` that carries the subspecie site property.

    The input is copied first. If the copy already has a ``subspecie`` site
    property it is returned unchanged; otherwise ``label_subspecies`` is called
    on the copy (with an empty ``site_indices`` list) before it is returned.

    :param cell_structure: A pymatgen ``Structure`` object.
    :return: A copy of the input ``cell_structure`` object with subspecie labels
        added, if missing.
    """
    structure_copy = cell_structure.copy()

    if "subspecie" in structure_copy.site_properties:
        return structure_copy

    # NOTE(review): the return value of ``label_subspecies`` is discarded, so this
    # relies on it labeling ``structure_copy`` in place -- confirm against
    # ``neighbormodels.structure``.
    label_subspecies(cell_structure=structure_copy, site_indices=[])
    return structure_copy