test: add python-code-header-cleanser test

Signed-off-by: aavarghese <avarghese@us.ibm.com>
IBM · Oct 16, 2024 · c9e366c · c9e366c
1 parent a85ac34
commit c9e366c
Show file tree

Hide file tree

Showing 12 changed files with 388 additions and 0 deletions.
diff --git a/tests/tests/python-code-header-cleanser/pail/.helmignore b/tests/tests/python-code-header-cleanser/pail/.helmignore
@@ -0,0 +1 @@
+test-data/
diff --git a/tests/tests/python-code-header-cleanser/pail/app.yaml b/tests/tests/python-code-header-cleanser/pail/app.yaml
@@ -0,0 +1,20 @@
+apiVersion: lunchpail.io/v1alpha1
+kind: Application
+metadata:
+  name: header_cleanser
+spec:
+  role: worker
+  image: docker.io/python:3.12
+  command: python3 ./main.py
+  needs:
+    - name: python
+      version: latest
+      requirements: |
+{{ .Files.Get "data/requirements.txt" | indent 8 }} 
+  code:
+    - name: main.py
+      source: |
+{{ .Files.Get "src/main.py" | indent 8 }}
+    - name: header_cleanser_transform.py
+      source: |
+{{ .Files.Get "src/header_cleanser_transform.py" | indent 8 }}
diff --git a/tests/tests/python-code-header-cleanser/pail/data/requirements.txt b/tests/tests/python-code-header-cleanser/pail/data/requirements.txt
@@ -0,0 +1,5 @@
+data-prep-toolkit==0.2.2.dev1
+scancode-toolkit[full]
+
+pyarrow
+setuptools
diff --git a/tests/tests/python-code-header-cleanser/pail/pool1.yaml b/tests/tests/python-code-header-cleanser/pail/pool1.yaml
@@ -0,0 +1,11 @@
+{{- range until (.Values.pools | default 1 | int) }}
+---
+apiVersion: lunchpail.io/v1alpha1
+kind: WorkerPool
+metadata:
+  name: {{ print "pool" (add 1 .) }}
+spec:
+  workers:
+    count: {{ $.Values.workers | default 1 }}
+    size: {{ $.Values.size | default "xxs" }}
+{{- end }}
diff --git a/tests/tests/python-code-header-cleanser/pail/src/header_cleanser_transform.py b/tests/tests/python-code-header-cleanser/pail/src/header_cleanser_transform.py
@@ -0,0 +1,224 @@
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+################################################################################
+
+import os
+import tempfile
+from argparse import ArgumentParser, Namespace
+
+import pyarrow as pa
+from data_processing.runtime.pure_python.runtime_configuration import (
+    PythonTransformRuntimeConfiguration,
+)
+from data_processing.transform import AbstractTableTransform, TransformConfiguration
+from data_processing.utils import CLIArgumentProvider, get_logger, str2bool
+from scancode import api
+
+
+logger = get_logger(__name__)
+
+short_name = "header_cleanser"
+cli_prefix = short_name + "_"
+COLUMN_KEY = "contents_column_name"
+LICENSE_KEY = "license"
+COPYRIGHT_KEY = "copyright"
+
+column_cli_params = f"{cli_prefix}{COLUMN_KEY}"
+license_cli_params = f"{cli_prefix}{LICENSE_KEY}"
+copyright_cli_params = f"{cli_prefix}{COPYRIGHT_KEY}"
+
+DEFAULT_COLUMN = "contents"
+DEFAULT_LICENSE = True
+DEFAULT_COPYRIGHT = True
+
+
+def file_generate(content):
+    """
+    Generate temporary file so that it can be passed to scancode-toolkit.
+    """
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
+            temp_file.write(content.encode("utf-8"))
+            temp_file_path = temp_file.name
+    except Exception as e:
+        print(f"Failed to create file : {e}")
+    return temp_file_path
+
+
+def fetch_index(dict_data):
+    """
+    Extract License and copyright start and endline from dictonary
+    """
+    ignore_lines = []
+    if dict_data.get("license_detections") != None:
+        for licenses in dict_data.get("license_detections"):
+            for match in licenses.get("matches"):
+                start_line = match["start_line"] - 1
+                end_line = match["end_line"] - 1
+                ignore_lines.extend([i for i in range(start_line, end_line + 1)])
+
+    if dict_data.get("copyrights") != None:
+        for copyrights in dict_data.get("copyrights"):
+            start_line = copyrights.get("start_line") - 1
+            end_line = copyrights.get("end_line") - 1
+            ignore_lines.extend([i for i in range(start_line, end_line + 1)])
+
+    return ignore_lines
+
+
+def check_empty_comment(code, ignore_lines):
+    min_index = min(ignore_lines)
+    max_index = max(ignore_lines)
+    code_list = code.split("\n")
+    if min_index != 0:
+        min_index = min_index - 1
+
+    if max_index <= len(code_list):
+        max_index = max_index + 2
+
+    for index in range(min_index, max_index):
+        if all(
+            not isinstance(x, (int, float, complex))
+            and not isinstance(x, str)
+            or (isinstance(x, str) and not x.isalnum())
+            for x in code_list[index]
+        ):
+            if index not in ignore_lines:
+                ignore_lines.append(index)
+
+    return ignore_lines
+
+
+def remove_copyright(code):
+    """
+    Using scancode.api function to detecte and remove copyright.
+    """
+    file_path = file_generate(content=code)
+    copyright_dict = api.get_copyrights(file_path)
+    os.remove(file_path)
+    ignore_lines = fetch_index(copyright_dict)
+    if ignore_lines != []:
+        modified_code = "\n".join([line for i, line in enumerate(code.split("\n"), 0) if i not in ignore_lines])
+        return modified_code, ignore_lines != []
+    else:
+        return code, False
+
+
+def remove_license(code):
+    """
+    Using scancode.api function to detecte and remove license.
+    """
+    file_path = file_generate(content=code)
+    license_dict = api.get_licenses(file_path)
+    os.remove(file_path)
+    ignore_lines = fetch_index(license_dict)
+    if ignore_lines != []:
+        modified_code = "\n".join([line for i, line in enumerate(code.split("\n"), 0) if i not in ignore_lines])
+        return modified_code, ignore_lines != []
+    else:
+        return code, False
+
+
+def remove_license_copyright(code):
+
+    file_path = file_generate(code)
+    copyright_dict = api.get_copyrights(file_path)
+    license_dict = api.get_licenses(file_path)
+    os.remove(file_path)
+    ignore_lines_license = fetch_index(license_dict)
+    ignore_lines_copyright = fetch_index(copyright_dict)
+    ignore_lines = ignore_lines_license + ignore_lines_copyright
+    if ignore_lines != []:
+        ignore_lines = check_empty_comment(code, ignore_lines)
+        modified_code = "\n".join([line for i, line in enumerate(code.split("\n"), 0) if i not in ignore_lines])
+        return modified_code, True
+    else:
+        return code, False
+
+
+class HeaderCleanserTransform(AbstractTableTransform):
+    def __init__(self, config: dict):
+        super().__init__(config)
+
+        self.column_name = config.get(COLUMN_KEY, DEFAULT_COLUMN)
+        self.license_remove = config.get(LICENSE_KEY, DEFAULT_LICENSE)
+        self.copyright_remove = config.get(COPYRIGHT_KEY, DEFAULT_COPYRIGHT)
+
+    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict]:
+
+        contents = table.column(self.column_name).to_pylist()
+        updated_content = []
+        remove_code_count = 0
+        for content in contents:
+            if self.license_remove and self.copyright_remove:
+                new_content, detect = remove_license_copyright(content)
+                if detect:
+                    remove_code_count += 1
+                updated_content.append(new_content)
+
+            elif self.copyright_remove:
+                new_content, detect = remove_copyright(content)
+                if detect:
+                    remove_code_count += 1
+                updated_content.append(new_content)
+
+            elif self.license_remove:
+                new_content, detect = remove_license(content)
+                if detect:
+                    remove_code_count += 1
+                updated_content.append(new_content)
+
+            else:
+                return [table], {"Removed code count": remove_code_count}
+
+        updated_content = pa.array(updated_content)
+
+        table = table.set_column(table.column_names.index(self.column_name), self.column_name, updated_content)
+
+        return [table], {"Removed code count": remove_code_count}
+
+
+class HeaderCleanserTransformConfiguration(TransformConfiguration):
+    def __init__(self):
+        super().__init__(name="header_cleanser", transform_class=HeaderCleanserTransform)
+
+    def add_input_params(self, parser: ArgumentParser) -> None:
+        parser.add_argument(
+            f"--{column_cli_params}",
+            required=False,
+            type=str,
+            default=f"{DEFAULT_COLUMN}",
+            help="Name of the column holds the data to process",
+        )
+        parser.add_argument(
+            f"--{license_cli_params}",
+            required=False,
+            type=lambda x: bool(str2bool(x)),
+            default=f"{DEFAULT_LICENSE}",
+            help="Set False if license should not be removed",
+        )
+        parser.add_argument(
+            f"--{copyright_cli_params}",
+            required=False,
+            type=lambda x: bool(str2bool(x)),
+            default=f"{DEFAULT_COPYRIGHT}",
+            help="Set False if copyright should not be removed ",
+        )
+
+    def apply_input_params(self, args: Namespace) -> bool:
+        captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+        self.params = self.params | captured
+        return True
+
+
+class HeaderCleanserPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+    def __init__(self):
+        super().__init__(transform_config=HeaderCleanserTransformConfiguration())
diff --git a/tests/tests/python-code-header-cleanser/pail/src/main.py b/tests/tests/python-code-header-cleanser/pail/src/main.py
@@ -0,0 +1,48 @@
+# (C) Copyright IBM Corp. 2024.
+# Licensed under the Apache License, Version 2.0 (the “License”);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#  http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an “AS IS” BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+################################################################################
+
+import os
+import sys
+
+import pyarrow.parquet as pq
+
+from header_cleanser_transform import (
+    COLUMN_KEY,
+    COPYRIGHT_KEY,
+    LICENSE_KEY,
+    HeaderCleanserTransform,
+)
+
+header_cleanser_params = {
+    COLUMN_KEY: "contents",
+    COPYRIGHT_KEY: True,
+    LICENSE_KEY: True,
+}
+
+if __name__ == "__main__":
+    # Create and configure the transform.
+    transform = HeaderCleanserTransform(header_cleanser_params)
+
+    try:
+        print(f"Reading in parquet file {sys.argv[1]}")
+        table = pq.read_table(sys.argv[1])
+    except Exception as e:
+        print(f"Error reading table: {e}", file=sys.stderr)
+        exit(1)
+        print(f"Done Reading in parquet file {sys.argv[1]}")
+
+    print(f"input table has {table.num_rows} rows")
+    # Transform the table
+    table_list, metadata = transform.transform(table)
+    print(f"\noutput table has {table_list[0].num_rows} rows")
+    print(f"output metadata : {metadata}")
+    pq.write_table(table_list[0], sys.argv[2])
diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/expected/metadata.json b/tests/tests/python-code-header-cleanser/pail/test-data/expected/metadata.json
@@ -0,0 +1,44 @@
+{
+    "pipeline": "pipeline_id",
+    "job details": {
+        "job category": "preprocessing",
+        "job name": "header_cleanser",
+        "job type": "pure python",
+        "job id": "job_id",
+        "start_time": "2024-06-16 09:05:43",
+        "end_time": "2024-06-16 09:05:50",
+        "status": "success"
+    },
+    "code": {
+        "github": "github",
+        "commit_hash": "12345",
+        "path": "path"
+    },
+    "job_input_params": {
+        "contents_column_name": "contents",
+        "license": "true",
+        "copyright": "true",
+        "checkpointing": false,
+        "max_files": -1,
+        "random_samples": -1,
+        "files_to_use": [".parquet"]
+    },
+    "job_output_stats": {
+        "source_files": 1,
+        "source_size": 17466,
+        "result_files": 1,
+        "result_size": 38953,
+        "processing_time": 7.257367134094238,
+        "Removed code count": 9,
+        "source_doc_count": 10,
+        "result_doc_count": 10
+    },
+    "source": {
+        "name": "/home/yash/git_fork_3/data-prep-kit/transforms/code/header_cleanser/python/test-data/input",
+        "type": "path"
+    },
+    "target": {
+        "name": "/home/yash/git_fork_3/data-prep-kit/transforms/code/header_cleanser/python/test-data/expected/license-and-copyright",
+        "type": "path"
+    }
+}
diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/expected/test1.parquet.gz b/tests/tests/python-code-header-cleanser/pail/test-data/expected/test1.parquet.gz
diff --git a/tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.gz b/tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.gz
diff --git a/tests/tests/python-code-header-cleanser/post.sh b/tests/tests/python-code-header-cleanser/post.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+DATA="$TEST_PATH"/pail/test-data
+
+function validate {
+    actual="$1"
+    expected="$2"
+
+    while true
+    do
+        if [ -f $actual ]
+        then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break
+        else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1
+        fi
+    done
+
+#    actual_sha256=$(cat "$actual" | sha256sum)
+#    expected_sha256=$(gunzip -c "$expected" | sha256sum)
+#
+#    if [ "$actual_sha256" = "$expected_sha256" ]
+#    then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME"
+#    else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1
+#    fi
+
+    rm -f "$actual"
+}
+
+validate task.1.output.txt "$DATA"/expected/test1.parquet.gz
diff --git a/tests/tests/python-code-header-cleanser/settings.sh b/tests/tests/python-code-header-cleanser/settings.sh
@@ -0,0 +1,6 @@
+api=workqueue
+
+expected=("input table has 5 rows" "output table has 5 rows")
+NUM_DESIRED_OUTPUTS=0
+
+up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz)'
diff --git a/tests/tests/python-code-header-cleanser/target b/tests/tests/python-code-header-cleanser/target
@@ -0,0 +1 @@
+../python-universal-tokenization/target