import functools
import re
import typing as T
from collections import abc
from unidecode import unidecode_expect_ascii
from nominally import config
from nominally.utilities import flatten_once, remove_falsy
# Type aliases: a "cluster" is one comma-delimited chunk of a name,
# held as a list of lowercase word tokens.
Cluster = T.List[str]
Clusters = T.List[Cluster]

if T.TYPE_CHECKING:
    # Static type checkers see the parameterized Mapping type...
    MappingBase = T.Mapping[str, str]
else:
    # ...while at runtime we subclass the plain ABC (subscripted typing
    # aliases are not usable as base classes on older Pythons).
    MappingBase = abc.Mapping
def word_count_bouncer(minimum: int) -> T.Callable[[T.Any], T.Any]:
    """
    Decorate only class/instance methods, enforce word count on first real arg.
    If there are too few (less than minimum) words, return the arguments.
    """
    WordContainer = T.Union[str, Cluster, Clusters]

    def decorator_bouncer(
        func: T.Callable[[T.Any, WordContainer], WordContainer]
    ) -> T.Callable[[T.Any, WordContainer], WordContainer]:
        """Return countable instead of func(countable) if too few words."""

        @functools.wraps(func)
        def wrapper_bouncer(obj: T.Any, countable: WordContainer) -> WordContainer:
            # Empty input never has enough words; hand it straight back.
            if not countable:
                return countable
            # Normalize str / Cluster / Clusters down to a flat token list.
            tokens: T.List[T.Any]
            if isinstance(countable, str):
                tokens = countable.split()
            elif isinstance(countable[0], str):
                tokens = countable
            else:
                tokens = flatten_once(countable)
            # Only tokens containing at least one letter count as words.
            n_words = sum(1 for token in tokens if re.search("[a-z]", token))
            return func(obj, countable) if n_words >= minimum else countable

        return wrapper_bouncer

    return decorator_bouncer
class Name(MappingBase):
    """A personal name, separated and simplified into component parts.

    Implements the Mapping protocol: iteration yields the six part keys
    (title, first, middle, last, suffix, nickname) and indexing returns
    each part joined into a single string.
    """

    _keys = ["title", "first", "middle", "last", "suffix", "nickname"]
    # The part names double as slots; they are never assigned, so attribute
    # access on them falls through to __getattr__ (and thus __getitem__).
    # https://github.com/vaneseltine/nominally/issues/47
    __slots__ = _keys + ["_raw", "_has_generational", "detail", "_final", "_cleaned"]

    def __init__(self, raw: str = "") -> None:
        """Run the full parse pipeline over the raw input string."""
        self._raw = raw
        self._has_generational = False
        self.detail: T.Dict[str, Cluster] = {k: [] for k in self._keys}
        s = self._pre_clean(self.raw)
        s = self._pre_process(s)
        s = self.clean(s)
        self._cleaned = self._archive_cleaned(s)
        self._process(s)
        self._post_process()
        self._final = self._post_clean()

    @staticmethod
    def _pre_clean(s: str) -> str:
        """Minimal possible pre-clean

        Still, e.g., allowing nickname extraction.
        """
        s = str(s)
        s = unidecode_expect_ascii(s)
        s = s.lower()
        return s

    def _pre_process(self, s: str) -> str:
        """Pull pieces that need to/can be processed from a string first."""
        s = self._sweep_nicknames(s)
        s = self._sweep_suffixes(s)
        s = self._sweep_junior(s)
        self.detail["nickname"] = self._clean_cluster(self.detail["nickname"])
        self.detail["suffix"] = self._clean_cluster(
            self.detail["suffix"], condense=True
        )
        return s

    @classmethod
    def clean(cls, s: str, *, condense: bool = False, final: bool = False) -> str:
        """Clean this string to the simplest possible representation (but no simpler).

        .. note::
            Assumes that any nicknames have already been removed,
            along with anything else that would depend on special
            characters (other than commas).
        """
        whitespace_out = "" if condense else " "
        cleaning_subs = [
            (r"(\s*(;|:|,))+", ", "),  # convert : ; , to , with spacing
            (r"\.\s*", ". "),  # reduce/add space after each .
            (r"[-_/\\:]+", "-"),  # convert _ / \ - : to single hyphen
            (r"[^-\sa-z0-9,]+", ""),  # drop most all excluding - , .
            (r"\s+", whitespace_out),  # condense all whitespace groups
        ]
        if final:
            cleaning_subs.append((r"[^a-z0-9- \)\()]", ""))
        for pattern, repl in cleaning_subs:
            s = re.sub(pattern, repl, s)
        s = cls.strip_pointlessness(s)
        # A string with no letters left is not a name part at all.
        if not re.search(r"[a-z]", s):
            return ""
        return s

    @staticmethod
    def strip_pointlessness(s: str) -> str:
        """Strip leading/trailing separator noise: hyphens, commas, pipes, spaces."""
        return s.strip("-, |")

    def _archive_cleaned(self, s: str) -> T.Set[str]:
        """Return a handy representation of cleaned string(s) as a set."""
        result = {s}
        result.update(*(tuple(x) for x in self.detail.values() if x))
        return result

    def _process(self, preprocessed_str: str) -> None:
        """Primary processing of clusters into extracted name parts."""
        clusters = self._string_to_clusters(preprocessed_str)
        clusters = self._extract_title(clusters)
        clusters = self._remove_numbers(clusters)
        clusters = self._grab_junior(clusters)
        self._extract_last_first_middle(clusters)

    def _post_process(self) -> None:
        """Any followup processes once all name parts have been extracted."""
        self.detail["suffix"].sort()

    @word_count_bouncer(minimum=3)
    def _sweep_suffixes(self, s: str) -> str:
        """Extract all possible (most) suffixes."""
        for pat, generational in config.SUFFIX_PATTERNS.items():
            if not pat.search(s):
                continue
            new_suffixes = pat.findall(s)
            if generational:
                # Remember so later junior-detection passes can stand down.
                self._has_generational = True
            self.detail["suffix"] += new_suffixes
            s = pat.sub("", s)
        return s

    @word_count_bouncer(minimum=3)
    def _sweep_junior(self, s: str) -> str:
        """First pass at 'junior' via regex from string."""
        if self._has_generational:
            return s
        new_s = config.JUNIOR_PATTERN.sub(", ", s)
        if new_s != s:
            self.detail["suffix"].append("jr")
        return new_s

    @word_count_bouncer(minimum=3)
    def _sweep_nicknames(self, s: str) -> str:
        """
        The content of parenthesis or quotes in the name will be added to the
        nicknames list. This happens before any other processing of the name.

        Single quotes cannot span white space characters and must border
        white space to allow for quotes in names like O'Connor and Kawai'ae'a.
        Double quotes and parenthesis can span white space.
        """
        for pattern in config.NICKNAME_PATTERNS:
            hit = pattern.findall(s)
            if hit:
                self.detail["nickname"] += hit
                s = pattern.sub("", s)
        return s

    @staticmethod
    def _string_to_clusters(remaining: str) -> Clusters:
        """Break a string into clusters by commas.

        I.e., 'piranha, dinsdale j' -> [['piranha'], ['dinsdale', 'j']]
        """
        cluster = re.split(r"\s*,\s*", remaining)
        return [x.split() for x in cluster if x]

    def _extract_title(self, clusters: Clusters) -> Clusters:
        """Remove a leading title word (at most one) from the clusters."""
        outgoing: Clusters = []
        while clusters:
            next_cluster = clusters.pop(0)
            first_word, *remainder = next_cluster
            if first_word in config.TITLES:
                # Take only the first title found; reassemble the rest.
                outgoing = outgoing + [remainder] + clusters
                self.detail["title"] = [first_word]
                return outgoing
            outgoing.append(next_cluster)
        self.detail["title"] = []
        return outgoing

    @classmethod
    def _remove_numbers(cls, cluster: Clusters) -> Clusters:
        """Clear out all numbers.

        Intended to be applied following primary generational suffix extraction.
        """
        no_numbers = [[cls._deep_number_clean(x) for x in word] for word in cluster]
        return remove_falsy(no_numbers)

    @classmethod
    def _deep_number_clean(cls, s: str) -> str:
        """Drop every digit from the token, then strip leftover separators."""
        no_numbers = re.sub(r"\d", "", s)
        return cls.strip_pointlessness(no_numbers)

    def _post_clean(self) -> T.Dict[str, Cluster]:
        """Produce the final, fully cleaned mapping of name part clusters."""
        return {k: self._clean_cluster(self.detail[k]) for k in self._keys}

    @classmethod
    def _clean_cluster(cls, cluster: Cluster, condense: bool = False) -> Cluster:
        """Clean the string of each token in a cluster."""
        cleaned = [cls.clean(s, condense=condense, final=True) for s in cluster]
        return [s for s in cleaned if s]

    @word_count_bouncer(minimum=3)
    def _grab_junior(self, clusters: Clusters) -> Clusters:
        """
        Extract "junior" as suffix unless

        - there is already a generational suffix
        - junior is the first word of the only cluster (e.g. 'junior x. smith')
        - junior is the first word of a multi-token multi-cluster
            e.g. leave as first 'smith, junior x'
            leave as first 'barnes-smith, junior james'
            take as suffix 'jake smith, junior'
        """
        if self._has_generational:
            return clusters
        if "junior" not in flatten_once(clusters):
            return clusters
        first_word_only_cluster = len(clusters) == 1 and clusters[0][0] == "junior"
        if first_word_only_cluster:
            return clusters
        first_word_in_likely_first_name = (
            len(clusters) == 2 and len(clusters[1]) > 1 and clusters[1][0] == "junior"
        )
        if first_word_in_likely_first_name:
            return clusters
        self.detail["suffix"].append("jr")
        return self._remove_from_clusters(clusters, "junior")

    @staticmethod
    def _remove_from_clusters(clusters: Clusters, s: str) -> Clusters:
        """Drop all strings from clusters that are equal to s"""
        return [[x for x in cluster if x != s] for cluster in clusters]

    def _extract_last_first_middle(self, clusters: Clusters) -> None:
        """Sequentially remove last name, first name, and collapse to middles"""
        clusters = self._extract_last(remove_falsy(clusters))
        clusters = self._extract_first(remove_falsy(clusters))
        self.detail["middle"] = flatten_once(clusters)

    @word_count_bouncer(1)
    def _extract_first(self, clusters: Clusters) -> Clusters:
        """
        Remove and return the first name from a prepared list of cluster

        If only one cluster remains, take its leftmost word;
        if more than one, take the leftmost cluster.
        """
        if len(clusters) == 1:
            self.detail["first"] = [clusters[0].pop(0)]
            return clusters
        self.detail["first"] = clusters.pop(0)
        return clusters

    @word_count_bouncer(1)
    def _extract_last(self, clusters: Clusters) -> Clusters:
        """Remove and return the last name from a prepared list of cluster"""
        # First, move any partitioned last name into rightmost cluster
        if len(clusters) > 1:
            clusters = self._flip_last_name_to_right(clusters)
        # Now group words of the rightmost cluster and take the rightmost word
        clusters[-1] = self._cluster_words(clusters[-1])
        self.detail["last"] = [clusters[-1].pop(-1)]
        return clusters

    @classmethod
    def _flip_last_name_to_right(cls, clusters: Clusters) -> Clusters:
        """Set up extraction by moving a last name to rightmost position"""
        partitioned_last = " ".join(clusters.pop(0))
        clusters[-1].append(partitioned_last)
        return clusters

    @classmethod
    def _cluster_words(cls, cluster: Cluster) -> Cluster:
        """
        Split list of cluster down to individual words and

        - join on conjuctions if appropriate
        - add prefixes to last names if appropriate
        """
        cluster = cls._combine_conjunctions(cluster)
        cluster = cls._combine_rightmost_prefixes(cluster)
        return cluster

    @classmethod
    @word_count_bouncer(minimum=4)
    def _combine_conjunctions(cls, cluster: Cluster) -> Cluster:
        """Accept one conjunction at the end: `bob|steve|cortez y costa`"""
        *new_cluster, last_name_one, conj, last_name_two = cluster
        if conj not in config.CONJUNCTIONS:
            return cluster
        rightmost = " ".join((last_name_one, conj, last_name_two))
        new_cluster.append(rightmost)
        return new_cluster

    @classmethod
    @word_count_bouncer(minimum=3)
    def _combine_rightmost_prefixes(cls, cluster: Cluster) -> Cluster:
        """Work right-to-left through cluster, joining up prefixes of rightmost"""
        result: Clusters = []
        for word in reversed(cluster):
            # Once more than one group exists we are past the rightmost name,
            # so stop absorbing prefixes into it.
            if len(result) > 1 or word not in config.PREFIXES:
                result.insert(0, [word])
                continue
            if not result:
                result = [[]]
            result[0].insert(0, word)
        return [" ".join(cluster) for cluster in result if cluster]

    def __eq__(self, other: T.Any) -> bool:
        """If Name is parsable and object dicts are identical, consider it equal."""
        try:
            return dict(self) == dict(other) and self.parsable
        except (ValueError, TypeError):
            return NotImplemented

    def __getattr__(self, name: str) -> T.Any:
        """
        Provides attribute access to all of Name._keys (by way of __iter__()
        for the keys and __getitem__() for the value).
        """
        if name in self.keys():
            return self[name]
        return self.__getattribute__(name)

    def __getitem__(self, key: str) -> T.Any:
        """Return the named part's tokens joined into a single string."""
        return " ".join(self._final[key]) or ""

    def __len__(self) -> int:
        """Return the number of name part keys."""
        return len(self._keys)

    def __iter__(self) -> T.Iterator[str]:
        """Iterate over the name part keys."""
        return iter(self._keys)

    def __repr__(self) -> str:
        """Show the parsed dict, or flag the name as unparsable."""
        if self.parsable:
            text = str(dict(self))
        else:
            text = "Unparsable"
        return f"{self.__class__.__name__}({text})"

    def __str__(self) -> str:
        """Output format: "last, title first middle suffix (nickname)"

        - "organs, mr harry x, jr (snapper)"
        - "organs, mr harry x, jr"
        - "organs, mr harry x"
        - "organs, harry x"
        - "organs, harry"
        - etc.
        """
        string_parts = [
            f"{self.last},",
            self.title,
            self.first,
            f"({self.nickname})" if self.nickname else "",
            self.middle,
            self.suffix,
        ]
        joined = " ".join(p for p in string_parts if p)
        return self.strip_pointlessness(joined)

    # API

    @property
    def parsable(self) -> bool:
        """Return true if any valid name values were created."""
        return any(x for x in self.values() if x)

    @property
    def raw(self) -> str:
        """Return the original input string."""
        return self._raw

    @property
    def cleaned(self) -> T.Set[str]:
        """Return some set of cleaned string parts."""
        return self._cleaned

    def report(self) -> T.Dict[str, T.Any]:
        """Return a more-or-less complete parsing dict."""
        return {
            "raw": self.raw,
            "cleaned": self.cleaned,
            "parsed": str(self),
            "list": list(self.values()),
            **dict(self),
        }