Skip to content

Commit

Permalink
refactor and fix accessing empty string error in parsing authors ID f…
Browse files Browse the repository at this point in the history
…ield
  • Loading branch information
tleedepriest committed Aug 15, 2023
1 parent 941f20e commit 6acad51
Showing 1 changed file with 63 additions and 55 deletions.
118 changes: 63 additions & 55 deletions litstudy/sources/scopus_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,103 +42,111 @@ def __init__(self, entry):

@property
def title(self) -> Optional[str]:
return self.entry.get("Title") or None
return self.entry.get("Title")

def _get_authors(self) -> Optional[List[str]]:
def _parse_authors(self, auths: str) -> List[str]:
"""
helper method. parse two formats of Authors field.
Formats:
1. deliminates author by , and [No author name available]
2. deliminates author by ; and [No Authors found]
"""
auths = self.entry.get("Authors")
no_authors_formats = ["[No Authors Found]", "[No author name available]"]
if auths in no_authors_formats:
return []
1. deliminated by , or ;
if either deliminator isn't present, will assume
only one author in field
"""
if ";" in auths:
author_list = auths.split("; ")
elif "," in auths:
author_list = auths.split(", ")
# add comma between auth last, first to match format
# in authors with affiliations
author_list = [', '.join(auth.rsplit(" ", 1)) for auth in author_list]
else: # single author...
author_list = [auths]
author_list = [", ".join(auth.rsplit(" ", 1)) for auth in author_list]

else: # single author...
author_list = [auths]
return author_list

def _get_authors_ids(self) -> Optional[List[str]]:
def _get_authors_ids(self) -> List[str]:
"""
helper method to parse two formats of
'Author(s) ID' field
1. AUTHOR_ID;AUTHOR_ID_2;LAST_AUTHOR_ID;
2. AUTHOR_ID; AUTHOR_ID_2; LAST AUTHOR_ID
"""
auths_id = self.entry.get("Author(s) ID", "")
auths_id = self.entry.get("Author(s) ID")

if auths_id == "":
return []

if auths_id[-1] == ";":
auths_id = auths_id[:-1]

auths_ids = auths_id.split(";")
auths_ids = [auth_id.lstrip().rstrip() for auth_id in auths_ids]
return auths_ids

def _get_auths_with_ids(self):
auths = self._get_authors()
auths_id = self._get_authors_ids()
def _try_to_add_ids_to_authors(self, auths: List[str]) -> List[str]:
"""
auths is non-zero length list.
"""
auths_ids = self._get_authors_ids()

if len(auths) == len(auths_id):
auths_w_ids = [f"{name} (ID: {auth_id})" for name, auth_id in zip(auths, auths_id)]
else:
auths_w_ids = []
return auths_w_ids
if len(auths_ids) == len(auths) and len(auths) > 0:
auths_w_ids = [f"{name} (ID: {auth_id})" for name, auth_id in zip(auths, auths_ids)]
return auths_w_ids
return auths

@staticmethod
def _get_affiliations(affiliation_substring):
affiliations = affiliation_substring.split(";")
affiliations = [
ScopusCsvAffiliation(aff) for aff in affiliations]
return affiliations
def _parse_affiliations(self, affs) -> List[str]:
if affs == "":
return []
return affs.split(";")

@property
def authors(self) -> List[ScopusCsvAuthor]:
auths = self._get_authors()
auths_affs = self.entry.get("Authors with affiliations")
auths_ids = self._get_authors_ids()
affs = self.entry.get("Affiliations")
# author_last, first initial, affiliation; .....
# try to add id to author name
if not auths_affs or auths is None or auths_ids is None:
auths = self.entry.get("Authors")
no_authors_formats = ["[No Authors Found]", "[No author name available]"]

if auths == "" or auths in no_authors_formats:
return []

auths_w_ids = self._get_auths_with_ids()
# use auths to search in auths_with_affs string
auths = self._parse_authors(auths)
authors_with_ids = self._try_to_add_ids_to_authors(auths)

affs = self.entry.get("Affiliations")
affs = self._parse_affiliations(affs)
# if single author, no way to know if ',' in author name
# within auths_affs field,
# so need to use 'Affiliations' field instead
# of searching for author in auths_affs
# within auths_affs field (can't search string),
# use 'Affiliations' field.

if len(auths) == 1:
return [ScopusCsvAuthor(auths[0], self._get_affiliations(affs))]
return [
ScopusCsvAuthor(authors_with_ids[0], [ScopusCsvAffiliation(aff) for aff in affs])
]

auths_affs = self.entry.get("Authors with affiliations")
if auths_affs == "": # can't map affiliations to authors
return [ScopusCsvAuthor(auth, []) for auth in authors_with_ids]

indexes_of_authors = [auths_affs.index(auth) for auth in auths]
auth_to_affs_mapping = {}

for num, index in enumerate(indexes_of_authors):
auth = auths[num]
if num < len(indexes_of_authors) - 1:
#auth = auths[num]

next_index = indexes_of_authors[num+1]
substring = auths_affs[index:next_index]
# only want part of string for current auhtor
if num < len(indexes_of_authors) - 1:
next_index = indexes_of_authors[num + 1]
cur_auth_affils = auths_affs[index:next_index]
# only want part of string for current author
# and affiliations
else:
substring = auths_affs[index:]

substring = substring.replace(f"{auth}, ", "")
affiliations = substring.split(";")
affiliations = [
ScopusCsvAffiliation(aff) for aff in affiliations]
if auths_w_ids!=[]:
auth_to_affs_mapping[auths_w_ids[num]] = affiliations
else:
auth_to_affs_mapping[auth] = affiliations
cur_auth_affils = auths_affs[index:]

# cur_auth_affils = substring.replace(f"{auth}, ", "")
# could be multiple affiliates, but no clear deliminator
cur_auth_affils = [ScopusCsvAffiliation(a) for a in affs if a in cur_auth_affils]
auth_to_affs_mapping[authors_with_ids[num]] = cur_auth_affils

return [ScopusCsvAuthor(a, b) for a, b in auth_to_affs_mapping.items()]

Expand Down

0 comments on commit 6acad51

Please sign in to comment.