From 19db43fd608ca0acb2658c42f0707cc10e4582f1 Mon Sep 17 00:00:00 2001 From: Travis Date: Tue, 29 Aug 2023 19:38:40 -0400 Subject: [PATCH] add support for two different types of csv files when downloading data from scopus. Make doc attributes consistent with scopus source. --- litstudy/sources/scopus_csv.py | 41 ++-- tests/conftest.py | 9 + tests/test_sources_scopus_csv.py | 329 +++++++++++++++++++++++-------- 3 files changed, 282 insertions(+), 97 deletions(-) create mode 100644 tests/conftest.py diff --git a/litstudy/sources/scopus_csv.py b/litstudy/sources/scopus_csv.py index 05b0d20..13d4ff0 100644 --- a/litstudy/sources/scopus_csv.py +++ b/litstudy/sources/scopus_csv.py @@ -87,9 +87,6 @@ def _get_authors_ids(self) -> List[str]: return auths_ids def _try_to_add_ids_to_authors(self, auths: List[str]) -> List[str]: - """ - auths is non-zero length list. - """ auths_ids = self._get_authors_ids() if len(auths_ids) == len(auths) and len(auths) > 0: @@ -100,18 +97,19 @@ def _try_to_add_ids_to_authors(self, auths: List[str]) -> List[str]: def _parse_affiliations(self, affs) -> List[str]: if affs == "": return [] - return affs.split(";") + return [aff.lstrip().rstrip() for aff in affs.split(";")] @property - def authors(self) -> List[ScopusCsvAuthor]: + def authors(self) -> Optional[List[ScopusCsvAuthor]]: auths = self.entry.get("Authors") no_authors_formats = ["[No Authors Found]", "[No author name available]"] if auths == "" or auths in no_authors_formats: - return [] + return None - # use auths to search in auths_with_affs string + # use auths to search in auths_with_affs string. auths = self._parse_authors(auths) + # use auths_with_ids for unique field. authors_with_ids = self._try_to_add_ids_to_authors(auths) affs = self.entry.get("Affiliations") @@ -119,7 +117,6 @@ def authors(self) -> List[ScopusCsvAuthor]: # if single author, no way to know if ',' in author name # within auths_affs field (can't search string), # use 'Affiliations' field. - if len(auths) == 1: return [ ScopusCsvAuthor(authors_with_ids[0], [ScopusCsvAffiliation(aff) for aff in affs]) @@ -127,27 +124,39 @@ def authors(self) -> List[ScopusCsvAuthor]: auths_affs = self.entry.get("Authors with affiliations") if auths_affs == "": # can't map affiliations to authors - return [ScopusCsvAuthor(auth, []) for auth in authors_with_ids] + return [ScopusCsvAuthor(auth, None) for auth in authors_with_ids] indexes_of_authors = [auths_affs.index(auth) for auth in auths] auth_to_affs_mapping = {} for num, index in enumerate(indexes_of_authors): - #auth = auths[num] + # auth = auths[num] if num < len(indexes_of_authors) - 1: next_index = indexes_of_authors[num + 1] cur_auth_affils = auths_affs[index:next_index] - # only want part of string for current author - # and affiliations else: cur_auth_affils = auths_affs[index:] - # cur_auth_affils = substring.replace(f"{auth}, ", "") # could be multiple affiliates, but no clear deliminator - cur_auth_affils = [ScopusCsvAffiliation(a) for a in affs if a in cur_auth_affils] - auth_to_affs_mapping[authors_with_ids[num]] = cur_auth_affils + affs_filtered = [a for a in affs if a in cur_auth_affils] + affs_filtered = sorted(affs_filtered, key=lambda x: len(x)) + # edge case is str in affs is substr of aff in cur_auth_affs + + # removes edge case where aff is substring of other aff + disclude = [] + short_string = affs_filtered[0] + for j in range(0, len(affs_filtered) - 1): + long_strings = affs_filtered[j + 1 :] + for ls in long_strings: + if short_string in ls: + disclude.append(short_string) + short_string = affs_filtered[j + 1] + + auth_to_affs_mapping[authors_with_ids[num]] = [ + ScopusCsvAffiliation(a) for a in affs_filtered if a not in disclude + ] return [ScopusCsvAuthor(a, b) for a, b in auth_to_affs_mapping.items()] @property @@ -169,7 +178,7 @@ def publication_year(self) -> Optional[int]: def keywords(self) -> Optional[List[str]]: keywords = self.entry.get("Author Keywords") if not keywords: - return None + return [] return keywords.split("; ") @property diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..c81ce27 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +import os +from litstudy.sources.scopus_csv import load_scopus_csv + + +def pytest_generate_tests(metafunc): + path = os.path.dirname(__file__) + "/resources/scopus.csv" + docs = load_scopus_csv(path) + if "doc" in metafunc.fixturenames: + metafunc.parametrize("doc", docs) diff --git a/tests/test_sources_scopus_csv.py b/tests/test_sources_scopus_csv.py index 6323149..d79a46a 100644 --- a/tests/test_sources_scopus_csv.py +++ b/tests/test_sources_scopus_csv.py @@ -1,90 +1,257 @@ import os +import pytest from litstudy.sources.scopus_csv import load_scopus_csv -#def test_doc_title_is_string(doc): -# assert isinstance(doc.title, str) -#def test_doc_publication_year_is_int(doc): -# assert isinstance(doc.publication_year, int) +def test_doc_id_doi_string(doc): + assert isinstance(doc.id.doi, str) or doc.id.doi is None -#def test_doc_keywords_is_list_or_none(doc): -# assert isinstance(doc.keywords, list) or doc.keywords is None -#def test_doc_authors(doc): -# authors = doc.author +def test_doc_title_is_string(doc): + assert isinstance(doc.title, str) -def test_load_scopus_csv_v2(): - path = os.path.dirname(__file__) + "/resources/scopus_v2.csv" - docs = load_scopus_csv(path) - for num, doc in enumerate(docs): - title = doc.title - doc_id_title = doc.id.title - doc_id_doi = doc.id.doi - doc_id_pubmed = doc.id.pubmed - doc_id_scopus = doc.id.scopusid - pub_year = doc.publication_year - keywords = doc.keywords - abstract = doc.abstract - citation_count = doc.citation_count - publication_source = doc.publication_source - source_type = doc.source_type - for author in doc.authors: - author_name = author.name - for aff in author.affiliations: - affiliation = aff.name - if num == 0: - assert title == doc_id_title - assert doc.title.startswith("Gender-specific visual perturbation effects") - assert doc.abstract.startswith("This study investigated the effects of different visual rotation speeds") - assert doc.publication_source == "Ergonomics" - assert doc.language == "English" - assert doc.publisher == "Taylor and Francis Ltd." - assert doc.citation_count == 0 - assert doc.keywords == [ - 'electromyography', - 'Gait', - 'simulation', - 'space medicine', - 'visual flow'] - assert doc.publication_year == 2023 - assert len(doc.authors) == 3 - assert doc.authors[0].name == "Hao J. (ID: 57221302630)" - assert doc.authors[0].affiliations[0].name == "Department of Health & Rehabilitation Sciences, College of Allied Health Professions, University of Nebraska Medical Center, Omaha, NE, United States" - -def test_load_scopus_csv(): + +def test_doc_publication_year_is_int(doc): + assert isinstance(doc.publication_year, int) + + +def test_doc_keywords_elements(doc): + assert all([isinstance(kw, str) for kw in doc.keywords]) + + +def test_doc_authors_elements(doc): + da = doc.authors + if da is not None: + assert all([hasattr(auth, "name") for auth in da]) + + +def test_doc_affiliations_elements(doc): + da = doc.authors + if da is not None: + for auth in doc.authors: + affs = auth.affiliations + if affs is not None: + assert all([hasattr(aff, "name") for aff in affs]) + + +@pytest.fixture +def doc_set_v1(): path = os.path.dirname(__file__) + "/resources/scopus.csv" docs = load_scopus_csv(path) - for num, doc in enumerate(docs): - title = doc.title - doc_id_title = doc.id.title - doc_id_doi = doc.id.doi - doc_id_pubmed = doc.id.pubmed - doc_id_scopus = doc.id.scopusid - pub_year = doc.publication_year - keywords = doc.keywords - abstract = doc.abstract - citation_count = doc.citation_count - publication_source = doc.publication_source - source_type = doc.source_type - for author in doc.authors: - author_name = author.name - for aff in author.affiliations: - affiliation = aff.name - if num == 0: - assert title == doc_id_title - assert doc.title == "Scalable molecular dynamics with NAMD" - assert doc.abstract.startswith("NAMD is a parallel molecular dynamics code") - assert doc.publication_source == "Journal of Computational Chemistry" - assert doc.language == "English" - assert doc.publisher == "John Wiley and Sons Inc." - assert doc.citation_count == 13169 - assert doc.keywords == [ - "Biomolecular simulation", - "Molecular dynamics", - "Parallel computing", - ] - assert doc.publication_year == 2005 - - assert len(doc.authors) == 10 - assert doc.authors[0].name == "Phillips, J.C. (ID: 57202138757)" - assert doc.authors[0].affiliations[0].name == "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + return docs + + +@pytest.fixture +def doc_set_v2(): + path = os.path.dirname(__file__) + "/resources/scopus_v2.csv" + docs = load_scopus_csv(path) + return docs + + +@pytest.fixture +def v1r0(doc_set_v1): + return doc_set_v1[0] + + +@pytest.fixture +def v2r0(doc_set_v2): + return doc_set_v2[0] + + +@pytest.fixture +def v2r36(doc_set_v2): + return doc_set_v2[36] + + +@pytest.fixture +def v1r1(doc_set_v1): + return doc_set_v1[1] + + +@pytest.fixture +def v1r6(doc_set_v1): + return doc_set_v1[6] + + +def test_v1r0_title(v1r0): + assert v1r0.title == "Scalable molecular dynamics with NAMD" + + +def test_v2r0_title(v2r0): + assert v2r0.title.startswith("Gender-specific visual perturbation effects") + + +def test_v1r0_abstract(v1r0): + assert v1r0.abstract.startswith("NAMD is a parallel molecular dynamics code") + + +def test_v2r0_abstract(v2r0): + assert v2r0.abstract.startswith( + "This study investigated the effects of different visual rotation speeds" + ) + + +def test_v1r0_publication_source(v1r0): + assert v1r0.publication_source == "Journal of Computational Chemistry" + + +def test_v2r0_publication_source(v2r0): + assert v2r0.publication_source == "Ergonomics" + + +def test_v1r0_language(v1r0): + assert v1r0.language == "English" + + +def test_v2r0_language(v2r0): + assert v2r0.language == "English" + + +def test_v1r0_publisher(v1r0): + assert v1r0.publisher == "John Wiley and Sons Inc." + + +def test_v2r0_publisher(v2r0): + assert v2r0.publisher == "Taylor and Francis Ltd." + + +def test_v1r0_citation_count(v1r0): + assert v1r0.citation_count == 13169 + + +def test_v2r0_citation_count(v2r0): + assert v2r0.citation_count == 0 + + +def test_v1r0_publication_year(v1r0): + assert v1r0.publication_year == 2005 + + +def test_v2r0_publication_year(v2r0): + assert v2r0.publication_year == 2023 + + +def test_v1r0_keywords(v1r0): + assert v1r0.keywords == [ + "Biomolecular simulation", + "Molecular dynamics", + "Parallel computing", + ] + + +def test_v1r1_keywords(v1r1): + assert v1r1.keywords == [] + + +def test_v2r0_keywords(v2r0): + assert v2r0.keywords == [ + "electromyography", + "Gait", + "simulation", + "space medicine", + "visual flow", + ] + + +def test_v1r0_authors(v1r0): + assert [auth.name for auth in v1r0.authors] == [ + "Phillips, J.C. (ID: 57202138757)", + "Braun, R. (ID: 7402220509)", + "Wang, W. (ID: 56948551400)", + "Gumbart, J. (ID: 8553717000)", + "Tajkhorshid, E. (ID: 6701753117)", + "Villa, E. (ID: 8412476700)", + "Chipot, C. (ID: 7003715790)", + "Skeel, R.D. (ID: 7005206020)", + "Kalé, L. (ID: 7005862685)", + "Schulten, K. (ID: 7102415947)", + ] + + +def test_v1r1_authors(v1r1): + assert [auth.name for auth in v1r1.authors] == ["Murata T. (ID: 7402736947)"] + + +def test_v2r0_authors(v2r0): + assert [auth.name for auth in v2r0.authors] == [ + "Hao J. (ID: 57221302630)", + "High R. (ID: 6701683718)", + "Siu K.-C. (ID: 57192938181)", + ] + + +def test_v2r0_author_affiliation(v2r0): + assert [[aff.name for aff in auth.affiliations] for auth in v2r0.authors] == [ + [ + "Department of Health & Rehabilitation Sciences, College of Allied Health Professions, University of Nebraska Medical Center, Omaha, NE, United States" + ], + [ + "Department of Biostatistics, College of Public Health, University of Nebraska Medical Center, Omaha, NE, United States" + ], + [ + "Department of Health & Rehabilitation Sciences, College of Allied Health Professions, University of Nebraska Medical Center, Omaha, NE, United States" + ], + ] + + +def test_v1r0_author_affiliation(v1r0): + assert [[aaff.name for aaff in auth.affiliations] for auth in v1r0.authors] == [ + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + ["UMR CNRS/UHP 7565, Université Henri Poincaré, 54506 Vandaeuvre-les-Nancy, Cedex, France"], + [ + "Department of Computer Science, Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Department of Computer Science, Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + [ + "Beckman Institute, University of Illinois at Urbana-Champaign, Urbana, IL 61801, United States" + ], + ] + + +def test_v1r6_author_affiliation(v1r6): + assert [[aaff.name for aaff in auth.affiliations] for auth in v1r6.authors] == [ + [ + "Department of Human Genetics, And Women's Hospital, Zhejiang University School of Medicine, Hangzhou, Zhejiang, China", + "Zhejiang Provincial Key Laboratory of Genetic & Developmental Disorders, Zhejiang University School of Medicine, Hangzhou, Zhejiang, China", + ], + [ + "Department of Ultrasound, Women's Hospital, Zhejiang University School of Medicine, Hangzhou, Zhejiang, China" + ], + [ + "Graduate School of Information Science and Technology, University of Tokyo, Tokyo, Japan" + ], + [ + "Department of Human Genetics, And Women's Hospital, Zhejiang University School of Medicine, Hangzhou, Zhejiang, China", + "Zhejiang Provincial Key Laboratory of Genetic & Developmental Disorders, Zhejiang University School of Medicine, Hangzhou, Zhejiang, China", + ], + [ + "Center for Genomic Medicine, Graduate School of Medicine, Kyoto University, Shogoinkawahara-cho, Kyoto-City, Kyoto, Sakyo-ku, Japan" + ], + [ + "Laboratory of DNA Information Analysis, Human Genome Center, Institute of Medical Science, University of Tokyo, Tokyo, Japan" + ], + ] + + +def test_v2r36_author_affiliation(v2r36): + for auth in v2r36.authors: + assert auth.affiliations == None