-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
test: add python-code-header-cleanser test
Signed-off-by: aavarghese <avarghese@us.ibm.com>
- Loading branch information
1 parent
a85ac34
commit c9e366c
Showing
12 changed files
with
388 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
test-data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
apiVersion: lunchpail.io/v1alpha1 | ||
kind: Application | ||
metadata: | ||
name: header_cleanser | ||
spec: | ||
role: worker | ||
image: docker.io/python:3.12 | ||
command: python3 ./main.py | ||
needs: | ||
- name: python | ||
version: latest | ||
requirements: | | ||
{{ .Files.Get "data/requirements.txt" | indent 8 }} | ||
code: | ||
- name: main.py | ||
source: | | ||
{{ .Files.Get "src/main.py" | indent 8 }} | ||
- name: header_cleanser_transform.py | ||
source: | | ||
{{ .Files.Get "src/header_cleanser_transform.py" | indent 8 }} |
5 changes: 5 additions & 0 deletions
5
tests/tests/python-code-header-cleanser/pail/data/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
data-prep-toolkit==0.2.2.dev1 | ||
scancode-toolkit[full] | ||
|
||
pyarrow | ||
setuptools |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
{{- range until (.Values.pools | default 1 | int) }} | ||
--- | ||
apiVersion: lunchpail.io/v1alpha1 | ||
kind: WorkerPool | ||
metadata: | ||
name: {{ print "pool" (add 1 .) }} | ||
spec: | ||
workers: | ||
count: {{ $.Values.workers | default 1 }} | ||
size: {{ $.Values.size | default "xxs" }} | ||
{{- end }} |
224 changes: 224 additions & 0 deletions
224
tests/tests/python-code-header-cleanser/pail/src/header_cleanser_transform.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,224 @@ | ||
# Licensed under the Apache License, Version 2.0 (the “License”); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an “AS IS” BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
################################################################################ | ||
|
||
import os | ||
import tempfile | ||
from argparse import ArgumentParser, Namespace | ||
|
||
import pyarrow as pa | ||
from data_processing.runtime.pure_python.runtime_configuration import ( | ||
PythonTransformRuntimeConfiguration, | ||
) | ||
from data_processing.transform import AbstractTableTransform, TransformConfiguration | ||
from data_processing.utils import CLIArgumentProvider, get_logger, str2bool | ||
from scancode import api | ||
|
||
|
||
# Module-level logger obtained from the data_processing helper.
logger = get_logger(__name__)

# Short name of the transform; every CLI option below is prefixed with it.
short_name = "header_cleanser"
cli_prefix = short_name + "_"

# Configuration keys read by HeaderCleanserTransform.__init__.
COLUMN_KEY = "contents_column_name"
LICENSE_KEY = "license"
COPYRIGHT_KEY = "copyright"

# Full command-line option names (prefix + configuration key).
column_cli_params = f"{cli_prefix}{COLUMN_KEY}"
license_cli_params = f"{cli_prefix}{LICENSE_KEY}"
copyright_cli_params = f"{cli_prefix}{COPYRIGHT_KEY}"

# Values used when a configuration key is not supplied.
DEFAULT_COLUMN = "contents"
DEFAULT_LICENSE = True
DEFAULT_COPYRIGHT = True
|
||
|
||
def file_generate(content):
    """
    Write ``content`` to a temporary ``.txt`` file so it can be passed to scancode-toolkit.

    The file is created with ``delete=False``; the caller owns it and must
    remove it once it is no longer needed.

    :param content: text to store; it is encoded as UTF-8.
    :return: path of the temporary file that was written.
    :raises Exception: whatever creating or writing the file raised. The error
        is reported on stdout, any partially written file is removed, and the
        original exception is re-raised.
    """
    temp_file_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
            # Remember the path before writing so a failed write can be cleaned up.
            temp_file_path = temp_file.name
            temp_file.write(content.encode("utf-8"))
    except Exception as e:
        print(f"Failed to create file : {e}")
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
        # Propagate the real cause; there is no usable path to return.
        raise
    return temp_file_path
|
||
|
||
def fetch_index(dict_data):
    """
    Extract the line indices covered by license and copyright detections.

    Spans are read from ``dict_data["license_detections"][*]["matches"][*]``
    and from ``dict_data["copyrights"][*]``. Each span contributes every index
    from ``start_line - 1`` to ``end_line - 1`` inclusive; subtracting one
    turns the reported line numbers into zero-based indices.

    :param dict_data: detection dictionary; either key may be absent or None.
    :return: list of zero-based line indices, license spans first. Overlapping
        spans produce duplicate entries.
    :raises KeyError: if a span lacks ``start_line`` or ``end_line``.
    """
    ignore_lines = []

    # License spans are nested one level deeper, under "matches".
    for detection in dict_data.get("license_detections") or ():
        for match in detection.get("matches") or ():
            ignore_lines.extend(range(match["start_line"] - 1, match["end_line"]))

    for entry in dict_data.get("copyrights") or ():
        ignore_lines.extend(range(entry["start_line"] - 1, entry["end_line"]))

    return ignore_lines
|
||
|
||
def check_empty_comment(code, ignore_lines):
    """
    Extend ``ignore_lines`` with content-free lines in and around the detected header.

    The window runs from one line before the smallest index to one line after
    the largest index, clamped to the lines that actually exist in ``code``.
    Every line in that window that contains no alphanumeric character (blank
    lines, bare comment markers such as ``#`` or ``*``) is added.

    :param code: source text; lines are separated by a newline character.
    :param ignore_lines: zero-based line indices already marked for removal.
        The list is modified in place.
    :return: the same ``ignore_lines`` list, possibly with indices appended.
    """
    if not ignore_lines:
        # Nothing detected, so there is no header to pad.
        return ignore_lines

    code_list = code.split("\n")
    start = max(min(ignore_lines) - 1, 0)
    # Clamp the end so a header reaching the last line cannot index past the file.
    stop = min(max(ignore_lines) + 2, len(code_list))

    already_ignored = set(ignore_lines)
    for index in range(start, stop):
        if index in already_ignored:
            continue
        if not any(char.isalnum() for char in code_list[index]):
            ignore_lines.append(index)
            already_ignored.add(index)

    return ignore_lines
|
||
|
||
def remove_copyright(code):
    """
    Detect copyright statements with scancode.api and remove the lines they cover.

    :param code: source text to clean.
    :return: tuple ``(text, removed)``. ``text`` is ``code`` without the
        detected lines and ``removed`` is True when at least one line was
        dropped; otherwise ``code`` is returned unchanged with False.
    """
    file_path = file_generate(content=code)
    try:
        copyright_dict = api.get_copyrights(file_path)
    finally:
        # Always delete the scratch file, even when the scan fails.
        os.remove(file_path)

    ignore_lines = set(fetch_index(copyright_dict))
    if not ignore_lines:
        return code, False

    modified_code = "\n".join(
        line for i, line in enumerate(code.split("\n")) if i not in ignore_lines
    )
    return modified_code, True
|
||
|
||
def remove_license(code):
    """
    Detect license text with scancode.api and remove the lines it covers.

    :param code: source text to clean.
    :return: tuple ``(text, removed)``. ``text`` is ``code`` without the
        detected lines and ``removed`` is True when at least one line was
        dropped; otherwise ``code`` is returned unchanged with False.
    """
    file_path = file_generate(content=code)
    try:
        license_dict = api.get_licenses(file_path)
    finally:
        # Always delete the scratch file, even when the scan fails.
        os.remove(file_path)

    ignore_lines = set(fetch_index(license_dict))
    if not ignore_lines:
        return code, False

    modified_code = "\n".join(
        line for i, line in enumerate(code.split("\n")) if i not in ignore_lines
    )
    return modified_code, True
|
||
|
||
def remove_license_copyright(code):
    """
    Detect both license text and copyright statements and remove their lines.

    Both scans run against a single temporary copy of ``code``. When anything
    is detected, content-free lines in and around the header are removed as
    well (see ``check_empty_comment``).

    :param code: source text to clean.
    :return: tuple ``(text, removed)``. ``text`` is ``code`` without the
        detected lines and ``removed`` is True when at least one line was
        dropped; otherwise ``code`` is returned unchanged with False.
    """
    file_path = file_generate(code)
    try:
        copyright_dict = api.get_copyrights(file_path)
        license_dict = api.get_licenses(file_path)
    finally:
        # Always delete the scratch file, even when a scan fails.
        os.remove(file_path)

    ignore_lines = fetch_index(license_dict) + fetch_index(copyright_dict)
    if not ignore_lines:
        return code, False

    # A set keeps the per-line membership test constant-time.
    ignore_lines = set(check_empty_comment(code, ignore_lines))
    modified_code = "\n".join(
        line for i, line in enumerate(code.split("\n")) if i not in ignore_lines
    )
    return modified_code, True
|
||
|
||
class HeaderCleanserTransform(AbstractTableTransform):
    """
    Table transform that strips license and/or copyright lines from a text column.
    """

    def __init__(self, config: dict):
        """
        :param config: transform configuration. Recognised keys are COLUMN_KEY
            (name of the column to clean), LICENSE_KEY and COPYRIGHT_KEY
            (whether to remove license / copyright lines). Missing keys fall
            back to the module defaults.
        """
        super().__init__(config)

        self.column_name = config.get(COLUMN_KEY, DEFAULT_COLUMN)
        self.license_remove = config.get(LICENSE_KEY, DEFAULT_LICENSE)
        self.copyright_remove = config.get(COPYRIGHT_KEY, DEFAULT_COPYRIGHT)

    def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict]:
        """
        Clean every value of the configured column.

        :param table: input table; must contain ``self.column_name`` when any
            removal is enabled.
        :param file_name: not used by this transform.
        :return: a one-element list holding the resulting table, and a metadata
            dict whose "Removed code count" is the number of rows in which
            something was removed.
        """
        # Pick the cleaning routine once instead of re-testing the flags per row.
        if self.license_remove and self.copyright_remove:
            cleanse = remove_license_copyright
        elif self.copyright_remove:
            cleanse = remove_copyright
        elif self.license_remove:
            cleanse = remove_license
        else:
            # Nothing to remove: hand the table back untouched, even if empty.
            return [table], {"Removed code count": 0}

        updated_content = []
        remove_code_count = 0
        for content in table.column(self.column_name).to_pylist():
            new_content, detect = cleanse(content)
            if detect:
                remove_code_count += 1
            updated_content.append(new_content)

        table = table.set_column(
            table.column_names.index(self.column_name),
            self.column_name,
            pa.array(updated_content),
        )

        return [table], {"Removed code count": remove_code_count}
|
||
|
||
class HeaderCleanserTransformConfiguration(TransformConfiguration):
    """
    Command-line configuration for HeaderCleanserTransform.
    """

    def __init__(self):
        super().__init__(name="header_cleanser", transform_class=HeaderCleanserTransform)

    def add_input_params(self, parser: ArgumentParser) -> None:
        """
        Register the column-name option and the two boolean removal options.

        :param parser: argument parser to extend.
        """
        parser.add_argument(
            f"--{column_cli_params}",
            type=str,
            required=False,
            default=f"{DEFAULT_COLUMN}",
            help="Name of the column holds the data to process",
        )

        # The two switches differ only in option name, default and help text.
        switches = (
            (license_cli_params, DEFAULT_LICENSE, "Set False if license should not be removed"),
            (copyright_cli_params, DEFAULT_COPYRIGHT, "Set False if copyright should not be removed "),
        )
        for option, fallback, description in switches:
            parser.add_argument(
                f"--{option}",
                type=lambda x: bool(str2bool(x)),
                required=False,
                default=f"{fallback}",
                help=description,
            )

    def apply_input_params(self, args: Namespace) -> bool:
        """
        Merge the options carrying this transform's prefix into ``self.params``.

        :param args: parsed command-line namespace.
        :return: always True.
        """
        overrides = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
        self.params = self.params | overrides
        return True
|
||
|
||
class HeaderCleanserPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
    """
    Pure-Python runtime configuration wrapping HeaderCleanserTransformConfiguration.
    """

    def __init__(self):
        transform_config = HeaderCleanserTransformConfiguration()
        super().__init__(transform_config=transform_config)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# (C) Copyright IBM Corp. 2024. | ||
# Licensed under the Apache License, Version 2.0 (the “License”); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an “AS IS” BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
################################################################################ | ||
|
||
import os | ||
import sys | ||
|
||
import pyarrow.parquet as pq | ||
|
||
from header_cleanser_transform import ( | ||
COLUMN_KEY, | ||
COPYRIGHT_KEY, | ||
LICENSE_KEY, | ||
HeaderCleanserTransform, | ||
) | ||
|
||
# Configuration handed to the transform: clean the "contents" column and
# remove both copyright and license lines.
header_cleanser_params = {
    COLUMN_KEY: "contents",
    COPYRIGHT_KEY: True,
    LICENSE_KEY: True,
}

if __name__ == "__main__":
    # Both paths are required; fail before doing any work if one is missing.
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <input.parquet> <output.parquet>", file=sys.stderr)
        sys.exit(1)
    input_path, output_path = sys.argv[1], sys.argv[2]

    # Create and configure the transform.
    transform = HeaderCleanserTransform(header_cleanser_params)

    print(f"Reading in parquet file {input_path}")
    try:
        table = pq.read_table(input_path)
    except Exception as e:
        print(f"Error reading table: {e}", file=sys.stderr)
        # sys.exit is always available; the bare exit() helper comes from site.
        sys.exit(1)
    print(f"Done Reading in parquet file {input_path}")

    print(f"input table has {table.num_rows} rows")
    # Transform the table
    table_list, metadata = transform.transform(table)
    print(f"\noutput table has {table_list[0].num_rows} rows")
    print(f"output metadata : {metadata}")
    pq.write_table(table_list[0], output_path)
44 changes: 44 additions & 0 deletions
44
tests/tests/python-code-header-cleanser/pail/test-data/expected/metadata.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
{ | ||
"pipeline": "pipeline_id", | ||
"job details": { | ||
"job category": "preprocessing", | ||
"job name": "header_cleanser", | ||
"job type": "pure python", | ||
"job id": "job_id", | ||
"start_time": "2024-06-16 09:05:43", | ||
"end_time": "2024-06-16 09:05:50", | ||
"status": "success" | ||
}, | ||
"code": { | ||
"github": "github", | ||
"commit_hash": "12345", | ||
"path": "path" | ||
}, | ||
"job_input_params": { | ||
"contents_column_name": "contents", | ||
"license": "true", | ||
"copyright": "true", | ||
"checkpointing": false, | ||
"max_files": -1, | ||
"random_samples": -1, | ||
"files_to_use": [".parquet"] | ||
}, | ||
"job_output_stats": { | ||
"source_files": 1, | ||
"source_size": 17466, | ||
"result_files": 1, | ||
"result_size": 38953, | ||
"processing_time": 7.257367134094238, | ||
"Removed code count": 9, | ||
"source_doc_count": 10, | ||
"result_doc_count": 10 | ||
}, | ||
"source": { | ||
"name": "/home/yash/git_fork_3/data-prep-kit/transforms/code/header_cleanser/python/test-data/input", | ||
"type": "path" | ||
}, | ||
"target": { | ||
"name": "/home/yash/git_fork_3/data-prep-kit/transforms/code/header_cleanser/python/test-data/expected/license-and-copyright", | ||
"type": "path" | ||
} | ||
} |
Binary file added
BIN
+13.9 KB
tests/tests/python-code-header-cleanser/pail/test-data/expected/test1.parquet.gz
Binary file not shown.
Binary file added
BIN
+14.7 KB
tests/tests/python-code-header-cleanser/pail/test-data/input/test1.parquet.gz
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env bash | ||
|
||
DATA="$TEST_PATH"/pail/test-data | ||
|
||
# Wait for the task output file to appear, report success, then delete it.
#   $1 - path of the output file the task is expected to produce
#   $2 - path of the expected (gzipped) result; only used by the disabled
#        checksum comparison below
function validate {
    # Keep these local so they do not leak into the calling shell.
    local actual="$1"
    local expected="$2"

    # Poll once per second until the file exists (no timeout).
    while true
    do
        # Quoted so a path with spaces or an empty value is tested correctly.
        if [ -f "$actual" ]
        then echo "✅ PASS found local task output file=$actual test=$TEST_NAME" && break
        else echo "Still waiting for local task output file=$actual test=$TEST_NAME" && sleep 1
        fi
    done

    # Content comparison is currently disabled:
    # actual_sha256=$(cat "$actual" | sha256sum)
    # expected_sha256=$(gunzip -c "$expected" | sha256sum)
    #
    # if [ "$actual_sha256" = "$expected_sha256" ]
    # then echo "✅ PASS the output file is valid file=$actual test=$TEST_NAME"
    # else echo "❌ FAIL mismatched sha256 on output file file=$actual actual_sha256=$actual_sha256 expected_sha256=$expected_sha256 test=$TEST_NAME" && exit 1
    # fi

    rm -f "$actual"
}
|
||
validate task.1.output.txt "$DATA"/expected/test1.parquet.gz |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
api=workqueue | ||
|
||
expected=("input table has 5 rows" "output table has 5 rows") | ||
NUM_DESIRED_OUTPUTS=0 | ||
|
||
up_args='<(gunzip -c "$TEST_PATH"/pail/test-data/input/test1.parquet.gz)' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
../python-universal-tokenization/target |