From 46cd613b5f8c7fd2af810b23bbfb99823b3ddcc5 Mon Sep 17 00:00:00 2001 From: Fabrizio Damicelli Date: Sun, 31 Mar 2024 16:22:07 +0200 Subject: [PATCH] handle comma in description --- src/ficamp/classifier/preprocessing.py | 10 ++++++++++ tests/test_preprocessing.py | 23 +++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/src/ficamp/classifier/preprocessing.py b/src/ficamp/classifier/preprocessing.py index 0c56791..8f13bb5 100644 --- a/src/ficamp/classifier/preprocessing.py +++ b/src/ficamp/classifier/preprocessing.py @@ -21,11 +21,21 @@ def remove_pipes(s: str) -> str: return " ".join(s.split("|")) +def remove_colon(s: str) -> str: + return " ".join(s.split(":")) + + +def remove_comma(s: str) -> str: + return " ".join(s.split(",")) + + def preprocess(s: str) -> str: "Clean up transaction description" steps = ( lambda s: s.lower(), remove_pipes, + remove_colon, + remove_comma, remove_digits, ) out = s diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 8063ebd..17e30de 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -2,6 +2,8 @@ from ficamp.classifier.preprocessing import ( preprocess, + remove_colon, + remove_comma, remove_digits, remove_pipes, ) @@ -35,6 +37,25 @@ def test_remove_pipes(inp, exp): assert remove_pipes(inp) == exp +@pytest.mark.parametrize( + ("inp,exp"), + ( + ("CSIDNL0213324324324", "CSIDNL0213324324324"), + ("CSID:NL0213324324324", "CSID NL0213324324324"), + ), +) +def test_remove_colon(inp, exp): + assert remove_colon(inp) == exp + + +@pytest.mark.parametrize( + ("inp,exp"), + (("CSID,NL0213324324324", "CSID NL0213324324324"),), +) +def test_remove_comma(inp, exp): + assert remove_comma(inp) == exp + + @pytest.mark.parametrize( ("inp,exp"), ( @@ -47,6 +68,8 @@ def test_remove_pipes(inp, exp): ("SEPA 12312321 bic", "sepa bic"), ("SEPA 12312321 123BIC", "sepa"), ("SEPA 1231|AMSTERDAM 123BIC", "sepa amsterdam"), + ("CSID:NL0213324324324", "csid"), + ("CSID:NL0213324324324 HELLO,world1332", "csid hello"), ), ) def test_preprocess(inp, exp):