From 40859dd145e0a437c1380af49fe5fdbf8d1838d5 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 9 Aug 2020 21:19:40 -0400
Subject: [PATCH 001/116] first working implementation
---
pilsner/__init__.py | 2 +
pilsner/model.py | 215 +++++++++++++++
pilsner/utility.py | 421 ++++++++++++++++++++++++++++++
scripts/linux/vscode.sh | 14 +
scripts/win/vscode.bat | 12 +
test/assets/sample_dictionary.txt | 7 +
test/assets/tokenizer1.xml | 5 +
test/assets/tokenizer2.xml | 5 +
test/sandbox.py | 52 ++++
9 files changed, 733 insertions(+)
create mode 100644 pilsner/__init__.py
create mode 100644 pilsner/model.py
create mode 100644 pilsner/utility.py
create mode 100755 scripts/linux/vscode.sh
create mode 100644 scripts/win/vscode.bat
create mode 100644 test/assets/sample_dictionary.txt
create mode 100644 test/assets/tokenizer1.xml
create mode 100644 test/assets/tokenizer2.xml
create mode 100644 test/sandbox.py
diff --git a/pilsner/__init__.py b/pilsner/__init__.py
new file mode 100644
index 0000000..ebb1491
--- /dev/null
+++ b/pilsner/__init__.py
@@ -0,0 +1,2 @@
+from .model import Model
+from .utility import Recognizer
diff --git a/pilsner/model.py b/pilsner/model.py
new file mode 100644
index 0000000..3dd51fa
--- /dev/null
+++ b/pilsner/model.py
@@ -0,0 +1,215 @@
+import os
+import logging
+import random
+import string
+import sqlite3
+import sic
+import pickle
+import shutil
+
+class Model(dict):
+
+ CONTENT_KEY = '~content'
+ SPECS_KEY = '~specs'
+ COMPRESSED_KEY = '~compressed'
+ TOKENIZER_OPTION_KEY = '~tokenizer_option'
+ WORD_SEPARATOR_KEY = '~word_separator'
+ ENTITY_KEY = '~i'
+ ATTRS_KEY = '~p'
+ INTERNAL_ID_KEY = '~iid'
+ DICTIONARY_KEY = '~dictionary'
+ KEYWORDS_KEY = '~keywords'
+ NORMALIZER_KEY = '~normalization'
+ DEFAULT_NORMALIZER_KEY = '~default_normalizer'
+ DATASOURCE_KEY = '~datasource'
+
+ DEFAULT_DATASOURCE_PATH = '.'
+ DEFAULT_DATASOURCE_FILENAME = ''
+ DEFAULT_DATASOURCE = ''
+
+ DEFAULT_WORD_SEPARATOR = ' '
+ DEFAULT_TOKENIZER_OPTION = 0
+
+ def __init__(self, filename='', storage_location='', debug_mode=False, verbose_mode=False):
+ self.DEFAULT_DATASOURCE_FILENAME = storage_location
+ if self.DEFAULT_DATASOURCE_FILENAME.lower() != ':memory:':
+ # honor an explicitly requested on-disk location; otherwise fall back to a fresh random temporary name
+ self.DEFAULT_DATASOURCE = '%s/%s' % (self.DEFAULT_DATASOURCE_PATH, self.DEFAULT_DATASOURCE_FILENAME)
+ while self.DEFAULT_DATASOURCE_FILENAME == '' or os.path.exists(self.DEFAULT_DATASOURCE):
+ self.DEFAULT_DATASOURCE_FILENAME = '.%s' % (''.join(random.choice(string.ascii_letters) for _ in range(7)))
+ self.DEFAULT_DATASOURCE = '%s/%s' % (self.DEFAULT_DATASOURCE_PATH, self.DEFAULT_DATASOURCE_FILENAME)
+ else:
+ self.DEFAULT_DATASOURCE = ':memory:'
+ self[self.NORMALIZER_KEY] = {}
+ self[self.DEFAULT_NORMALIZER_KEY] = ''
+ self[self.DICTIONARY_KEY] = []
+ self[self.KEYWORDS_KEY] = {}
+ self[self.DATASOURCE_KEY] = self.DEFAULT_DATASOURCE
+ self[self.WORD_SEPARATOR_KEY] = self.DEFAULT_WORD_SEPARATOR
+ self[self.TOKENIZER_OPTION_KEY] = self.DEFAULT_TOKENIZER_OPTION
+ self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
+ self.cursor = self.connection.cursor()
+ self.normalizer_map = {}
+ self.sic_builder = sic.Builder(debug_mode=debug_mode, verbose_mode=verbose_mode)
+ if filename != '':
+ self.load(filename)
+
+ def __del__(self):
+ # remove all temporary resources
+ self.connection.close()
+ if os.path.exists(self.DEFAULT_DATASOURCE):
+ os.remove(self.DEFAULT_DATASOURCE)
+
+ def save(self, filename):
+ assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
+ logging.debug('Saving model "%s"' % (filename))
+ # flush pending attribute inserts before the database file is copied
+ self.connection.commit()
+ self.cursor.close()
+ self.connection.close()
+ normalizers = {
+ self.DEFAULT_NORMALIZER_KEY: self[self.DEFAULT_NORMALIZER_KEY],
+ self.WORD_SEPARATOR_KEY: self[self.WORD_SEPARATOR_KEY],
+ self.TOKENIZER_OPTION_KEY: self[self.TOKENIZER_OPTION_KEY],
+ self.NORMALIZER_KEY: {normalizer_name: self[self.NORMALIZER_KEY][normalizer_name].data for normalizer_name in self[self.NORMALIZER_KEY]}
+ }
+ with open('%s.normalizers' % (filename), mode='wb') as f:
+ pickle.dump(normalizers, f)
+ logging.debug('Saved "%s"' % ('%s.normalizers' % (filename)))
+ for dictionary_number in range(len(self[self.DICTIONARY_KEY])):
+ with open('%s.%d.dictionary' % (filename, dictionary_number), mode='wb') as f:
+ pickle.dump(self[self.DICTIONARY_KEY][dictionary_number], f)
+ logging.debug('Saved "%s"' % ('%s.%d.dictionary' % (filename, dictionary_number)))
+ with open('%s.keywords' % (filename), mode='wb') as f:
+ pickle.dump(self[self.KEYWORDS_KEY], f)
+ logging.debug('Saved "%s"' % ('%s.keywords' % (filename)))
+ shutil.copyfile(self[self.DATASOURCE_KEY], '%s.attributes' % (filename))
+ logging.debug('Saved "%s"' % ('%s.attributes' % (filename)))
+ self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
+ self.cursor = self.connection.cursor()
+ logging.debug('Saved "%s"' % (filename))
+ return True
+
+ def load(self, filename):
+ logging.debug('Loading model "%s"' % (filename))
+ self.cursor.close()
+ self.connection.close()
+ with open('%s.normalizers' % (filename), mode='rb') as f:
+ normalizers = pickle.load(f)
+ for normalizer_name in normalizers[self.NORMALIZER_KEY]:
+ self[self.NORMALIZER_KEY][normalizer_name] = self.sic_builder.build_normalizer()
+ self[self.NORMALIZER_KEY][normalizer_name].data = normalizers[self.NORMALIZER_KEY][normalizer_name]
+ self[self.WORD_SEPARATOR_KEY] = normalizers[self.WORD_SEPARATOR_KEY]
+ self[self.TOKENIZER_OPTION_KEY] = normalizers[self.TOKENIZER_OPTION_KEY]
+ self[self.DEFAULT_NORMALIZER_KEY] = normalizers[self.DEFAULT_NORMALIZER_KEY]
+ logging.debug('Loaded "%s"' % ('%s.normalizers' % (filename)))
+ # sorted so that dictionary shards are loaded in a stable order
+ for _filename in sorted(os.listdir(os.path.dirname(filename)) if os.path.dirname(filename) != '' else os.listdir()):
+ if _filename.startswith(os.path.basename(filename) + '.') and _filename.endswith('.dictionary'):
+ with open('%s/%s' % (os.path.dirname(filename), _filename) if os.path.dirname(filename) != '' else _filename, mode='rb') as f:
+ dictionary = pickle.load(f)
+ self[self.DICTIONARY_KEY].append(dictionary)
+ logging.debug('Loaded "%s"' % ('%s/%s' % (os.path.dirname(filename), _filename) if os.path.dirname(filename) != '' else _filename))
+ with open('%s.keywords' % (filename), mode='rb') as f:
+ keywords = pickle.load(f)
+ self[self.KEYWORDS_KEY] = keywords
+ logging.debug('Loaded "%s"' % ('%s.keywords' % (filename)))
+ self[self.DATASOURCE_KEY] = '%s.attributes' % (filename)
+ self.connection = sqlite3.connect(self[self.DATASOURCE_KEY])
+ self.cursor = self.connection.cursor()
+ return True
+
+ def add_normalizer(self, normalizer_name, filename, default=False):
+ logging.debug('Adding normalizer "%s" from "%s"' % (normalizer_name, filename))
+ normalizer = self.sic_builder.build_normalizer(filename)
+ self[self.NORMALIZER_KEY][normalizer_name] = normalizer
+ if len(self[self.NORMALIZER_KEY]) == 1 or default:
+ self[self.DEFAULT_NORMALIZER_KEY] = normalizer_name
+ logging.debug('Added normalizer "%s" from "%s"' % (normalizer_name, filename))
+ return True
+
+ def create_recognizer_schema(self, cursor):
+ logging.debug('Creating schema for permanent storage')
+ cursor.execute('create table attrs (n integer, iid integer, attr_name text, attr_value text);')
+ logging.debug('Created schema for permanent storage')
+
+ def pack_subtrie(self, trie, compressed, prefix):
+ # recursively collapse chains of single-child nodes into multi-character (radix) keys
+ if not compressed:
+ return trie, prefix
+ if type(trie) != dict:
+ return trie, prefix
+ if prefix == self.ENTITY_KEY:
+ return trie, prefix
+ children = trie
+ child_count = len(children)
+ if child_count == 1:
+ for key, child in children.items():
+ if key == self.ENTITY_KEY:
+ if len(prefix) > 1:
+ return {prefix[1:]: trie}, prefix[0]
+ return trie, prefix
+ next_prefix = prefix + key
+ comp_child, comp_key = self.pack_subtrie(child, compressed, next_prefix)
+ if prefix == '':
+ comp_children = {comp_key: comp_child}
+ else:
+ comp_children = comp_child
+ return comp_children, comp_key
+ else:
+ comp_children = {}
+ for key, child in children.items():
+ comp_child, comp_key = self.pack_subtrie(child, compressed, key)
+ comp_children[comp_key] = comp_child
+ if len(prefix) > 1:
+ comp_children = {prefix[0]: {prefix[1:]: comp_children}}
+ return comp_children[prefix[0]], prefix[0]
+ return comp_children, prefix
+
+ def pack_trie(self, trie, compressed):
+ ret = {k: trie[k] for k in trie if k != self.CONTENT_KEY}
+ ret[self.CONTENT_KEY] = {}
+ for normalizer_name in trie[self.CONTENT_KEY]:
+ packed = self.pack_subtrie(trie[self.CONTENT_KEY][normalizer_name], compressed, '')[0]
+ ret[self.CONTENT_KEY][normalizer_name] = packed
+ return ret
+
+ def attribute_wrapper(self, line_number, normalizer_name, internal_id, subtrie, trie, specs, columns):
+ # record the synonym's line number in the trie leaf and its attributes in the attrs table
+ if self.ENTITY_KEY not in subtrie:
+ subtrie[self.ENTITY_KEY] = []
+ subtrie[self.ENTITY_KEY].append(line_number)
+ for k in specs['fields']:
+ if specs['fields'][k][3]:
+ continue
+ if not specs['fields'][k][1]:
+ self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, columns[specs['fields'][k][0]]))
+ else:
+ _ = [ self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split( specs['fields'][k][1]) ) ]
+
+ def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall):
+ columns = [x.strip(cell_wall) for x in line.split(column_separator)]
+ if line_number in line_numbers:
+ internal_id = line_numbers[line_number]
+ else:
+ entity_id = columns[specs['id'][0]]
+ if entity_id not in entity_ids:
+ entity_ids[entity_id] = len(entity_ids)
+ internal_id = entity_ids[entity_id]
+ return columns, internal_id
+
+ def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_option=0):
+ synonym, normalizer_name = columns[specs['value'][0]], None
+ if self[self.NORMALIZER_KEY]:
+ if specs['tokenizer']:
+ if columns[specs['tokenizer'][0]] not in self.normalizer_map:
+ normalizer_name = self[self.DEFAULT_NORMALIZER_KEY]
+ elif columns[specs['tokenizer'][0]] in self.normalizer_map and self.normalizer_map[columns[specs['tokenizer'][0]]] in self[self.NORMALIZER_KEY]:
+ normalizer_name = self.normalizer_map[columns[specs['tokenizer'][0]]]
+ if normalizer_name is not None:
+ synonym = self[self.NORMALIZER_KEY][normalizer_name].normalize(synonym, word_separator, tokenizer_option)
+ return synonym, normalizer_name
+
+ def next_trie(self, specs, compressed, tokenizer_option, word_separator):
+ new_trie = {
+ self.CONTENT_KEY: {normalizer_name: {} for normalizer_name in self[self.NORMALIZER_KEY]},
+ self.SPECS_KEY: specs,
+ self.COMPRESSED_KEY: int(compressed),
+ self.TOKENIZER_OPTION_KEY: tokenizer_option,
+ self.WORD_SEPARATOR_KEY: word_separator
+ }
+ return new_trie
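Note on pack_subtrie above: when compression is on, it collapses chains of single-child trie nodes into multi-character (radix) keys. Below is a minimal standalone sketch of the same idea, for illustration only; it is not the pilsner API, and unlike pilsner's packing it merges the first character too (pilsner keeps the first character as a one-character edge so that unpack_trie can resume character-by-character lookup).

    ENTITY_KEY = '~i'

    def pack(node):
        # collapse chains of single-child nodes into multi-character (radix) keys
        if not isinstance(node, dict):
            return node
        packed = {}
        for key, child in node.items():
            if key == ENTITY_KEY:
                packed[key] = child
                continue
            child = pack(child)
            # merge this edge with its only outgoing edge while possible
            while isinstance(child, dict) and len(child) == 1:
                (next_key, next_child), = child.items()
                if next_key == ENTITY_KEY:
                    break
                key, child = key + next_key, next_child
            packed[key] = child
        return packed

    # 'car' -> entity 0, 'card' -> entity 1, 'cat' -> entity 2
    trie = {'c': {'a': {'r': {'~i': [0], 'd': {'~i': [1]}}, 't': {'~i': [2]}}}}
    print(pack(trie))
    # {'ca': {'r': {'~i': [0], 'd': {'~i': [1]}}, 't': {'~i': [2]}}}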
diff --git a/pilsner/utility.py b/pilsner/utility.py
new file mode 100644
index 0000000..aa2c1fb
--- /dev/null
+++ b/pilsner/utility.py
@@ -0,0 +1,421 @@
+import logging
+import sqlite3
+import os
+
+class Recognizer():
+
+ OPERATIONAL_STORAGE = ''
+ PERMANENT_STORAGE = ''
+
+ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None, permanent_storage='', operational_storage=':memory:'):
+ logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
+ self.debug = debug_mode
+ self.verbose = verbose_mode
+ self.logger = logging.info if self.verbose else logging.debug
+ if self.verbose:
+ logging.root.setLevel(logging.INFO)
+ if self.debug:
+ logging.root.setLevel(logging.DEBUG)
+ self.logger('Debug mode is on')
+ self.callback_status = callback_status
+ self.callback_progress = callback_progress
+ self.PERMANENT_STORAGE = permanent_storage
+ self.OPERATIONAL_STORAGE = operational_storage
+ self.o_connection = sqlite3.connect(self.OPERATIONAL_STORAGE)
+ self.o_cursor = self.o_connection.cursor()
+ self.o_cursor.execute('create table if not exists flatten (l integer, r integer, attr_name text, attr_value text);')
+ self.o_cursor.execute('delete from flatten;')
+ self.o_connection.commit()
+ logging.debug('Recognizer class has been initialized')
+
+ def __del__(self):
+ # remove all temporary resources
+ self.o_connection.close()
+ if os.path.exists(self.OPERATIONAL_STORAGE):
+ os.remove(self.OPERATIONAL_STORAGE)
+
+ def push_message(self, message, callback_function):
+ if callback_function is not None:
+ callback_function(message)
+
+ def compile_dict_specs(self, fields):
+ logging.debug('Compiling specs')
+ specs = {'fields': {}, 'id': None, 'tokenizer': None, 'value': None}
+ # input: [{'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False}, ...]
+ # output: specs['fields'][name] = (column_index, delimiter, normalizer_flag, value_flag)
+ for i, field in enumerate(fields):
+ if not field['include']:
+ continue
+ specs['fields'][field['name']] = (i, field['delimiter'], field['normalizer_flag'], field['value_flag'])
+ if field['id_flag']:
+ specs['id'] = specs['fields'][field['name']]
+ if field['normalizer_flag']:
+ specs['tokenizer'] = specs['fields'][field['name']]
+ if field['value_flag']:
+ specs['value'] = specs['fields'][field['name']]
+ logging.debug('Done compiling specs')
+ return specs
+
+ def make_recognizer(self, model, filename, specs, word_separator, item_limit, compressed, column_separator, cell_wall, tokenizer_option):
+ # TODO: review for refactoring
+ self.logger('Making recognizer using %s' % (filename))
+ self.push_message('Making recognizer using %s' % (filename), self.callback_status)
+ entity_ids = {}
+ line_numbers = {}
+ total_bytes = os.path.getsize(filename) + 1
+ increment_bytes = int(total_bytes / 100) if total_bytes > 100 else total_bytes
+ this_progress_position = 0
+ last_progress_position = 0
+ # create the attrs table on first use (the subquery yields a row only when the table is missing)
+ rows = model.cursor.execute('select 0 where not exists (select name from sqlite_master where type = \'table\' and name = \'attrs\');')
+ for _ in rows:
+ model.create_recognizer_schema(model.cursor)
+ break
+ with open(filename, mode='r', encoding='utf8') as f:
+ ret = []
+ line_count = 0
+ line_number = 0
+ chars_read = 0
+ trie = model.next_trie(specs, compressed, tokenizer_option, word_separator)
+ for line in f:
+ chars_read += len(line)
+ this_progress_position = int(chars_read / increment_bytes)
+ if this_progress_position != last_progress_position:
+ last_progress_position = this_progress_position
+ self.push_message(int(100 * chars_read / total_bytes), self.callback_progress)
+ if item_limit > 0 and line_count == item_limit:
+ packed = model.pack_trie(trie, compressed)
+ ret.append(packed)
+ trie = model.next_trie(specs, compressed, tokenizer_option, word_separator)
+ self.logger('Lines read: %d' % (line_count))
+ line_count = 0
+ columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall)
+ synonym, normalizer_name = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
+ subtrie = trie[model.CONTENT_KEY][normalizer_name]
+ for character in synonym:
+ if character not in subtrie:
+ subtrie[character] = {}
+ subtrie = subtrie[character]
+ model.attribute_wrapper(line_number, normalizer_name, internal_id, subtrie, trie, specs, columns)
+ line_count += 1
+ line_number += 1
+ if line_count > 0 and len(trie) > 3:
+ packed = model.pack_trie(trie, compressed)
+ ret.append(packed)
+ self.logger('Lines read: %d' % (line_count))
+ model.connection.commit()
+ model.cursor.execute('create index ix_attrs_n_attr_name_attr_value on attrs (n asc, attr_name asc, attr_value asc);')
+ self.logger('Recognizer completed.')
+ return ret, line_numbers
+
+ def make_keywords(self, model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, cell_wall, tokenizer_option):
+ self.logger('Making keywords using %s... ' % (filename))
+ self.push_message('Making keywords from {0}'.format(filename), self.callback_status)
+ total_bytes = os.path.getsize(filename) + 1
+ increment_bytes = int(total_bytes / 100) if total_bytes > 100 else total_bytes
+ this_progress_position = 0
+ last_progress_position = 0
+ entity_ids = {}
+ internal_id_map = {}
+ synonyms = {}
+ with open(filename, mode='r', encoding='utf8') as f:
+ line_count = 0
+ chars_read = 0
+ for line in f:
+ chars_read += len(line)
+ this_progress_position = int(chars_read / increment_bytes)
+ if this_progress_position != last_progress_position:
+ last_progress_position = this_progress_position
+ self.push_message(int(100 * chars_read / total_bytes), self.callback_progress)
+ columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, cell_wall)
+ internal_id_map[line_count] = internal_id
+ synonym, _ = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
+ if synonym not in synonyms:
+ synonyms[synonym] = set()
+ synonyms[synonym].add(internal_id)
+ line_count += 1
+ overlapping_ids = {}
+ for s in synonyms:
+ if len(synonyms[s]) > 1 or disambiguate_all:
+ for internal_id in synonyms[s]:
+ overlapping_ids[internal_id] = set()
+ synonyms.clear()
+ entity_ids.clear()
+ with open(filename, mode='r', encoding='utf8') as f:
+ line_count = 0
+ for line in f:
+ columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, cell_wall)
+ if internal_id in overlapping_ids:
+ synonym, _ = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
+ tokens = synonym.split(word_separator)
+ overlapping_ids[internal_id] = overlapping_ids[internal_id].union(set(tokens))
+ line_count += 1
+ # TODO: only leave tokens unique for a given internal_id
+ keywords = {model.CONTENT_KEY: overlapping_ids, model.INTERNAL_ID_KEY: internal_id_map}
+ self.logger('Done compiling keywords.')
+ return keywords
+
+ def compile_model(self, model, filename, specs, word_separator, column_separator, cell_wall, compressed=True, item_limit=0, tokenizer_option=0, include_keywords=False, disambiguate_all=False):
+ tries, line_numbers = self.make_recognizer(model, filename, specs, word_separator, item_limit, compressed, column_separator, cell_wall, tokenizer_option)
+ keywords = {model.CONTENT_KEY: {}, model.INTERNAL_ID_KEY: {}}
+ if include_keywords:
+ keywords = self.make_keywords(model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, cell_wall, tokenizer_option)
+ model[model.DICTIONARY_KEY] = tries
+ model[model.KEYWORDS_KEY] = keywords
+ return True
+
+ def verify_keywords(self, model, recognized, src, word_separator):
+ # for each recognized span, keep only the candidate entity ids whose distinctive keywords appear in the surrounding text
+ id_list = [set([model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]]) for rec in recognized]
+ for k in range(len(id_list)):
+ ids = id_list[k]
+ if len(ids) < 2:
+ continue
+ si = 0
+ ei = len(src)
+ if k > 0:
+ si = recognized[k-1][4]
+ if k < len(id_list) - 1:
+ ei = recognized[k+1][3]
+ tokens = src[si:ei]
+ s_tokens = set(tokens.split(word_separator))
+ tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
+ kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
+ winner_score = 0
+ winner_id = set()
+ kwd_score = {}
+ for i in kwd:
+ kwd_score[i] = len(kwd[i].intersection(s_tokens))
+ if kwd_score[i] > winner_score:
+ winner_score = kwd_score[i]
+ winner_id.clear()
+ if kwd_score[i] == winner_score:
+ winner_id.add(i)
+ recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:])
+
+ def unpack_trie(self, model, packed_trie, compressed):
+ """TODO: add docstring here
+ """
+ if not compressed or len(packed_trie) != 1:
+ return packed_trie
+ branches = [k for k in packed_trie.keys() if k not in [model.ENTITY_KEY]]
+ if not branches:
+ return packed_trie
+ radix = branches[0]
+ if len(radix) <= 1:
+ return packed_trie
+ unpacked_trie = {}
+ unpacked_trie_pointer = unpacked_trie
+ for character in radix[:-1]:
+ unpacked_trie_pointer[character] = {}
+ unpacked_trie_pointer = unpacked_trie_pointer[character]
+ unpacked_trie_pointer[radix[-1:]] = packed_trie[radix]
+ return unpacked_trie
+
+ def check_attrs(self, model, trie_leaf, cur, specs, include_query, exclude_query, process_exclude, attrs_out_query):
+ trie_leaf[model.ATTRS_KEY] = self.attribute_unpacker(cur, trie_leaf[model.ENTITY_KEY], include_query, exclude_query, process_exclude, attrs_out_query)
+ if len(trie_leaf[model.ATTRS_KEY]) == 0:
+ return {}
+ return trie_leaf
+
+ def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query):
+ attributes = {}
+
+ include = set()
+ exclude = set()
+ for n in leaf_ids:
+ rows = cur.execute('select distinct n from attrs where n = %d %s;' % (n, include_query))
+ for row in rows:
+ include.add(int(row[0]))
+ if process_exclude:
+ for n in leaf_ids:
+ rows = cur.execute('select distinct n from attrs where n = %d %s;' % (n, exclude_query))
+ for row in rows:
+ exclude.add(int(row[0]))
+
+ ns = include - exclude
+
+ for n in ns:
+ rows = cur.execute('select attr_name, attr_value from attrs where n = %d%s;' % (n, attrs_out_query))
+ if n not in attributes:
+ attributes[n] = {}
+ for row in rows:
+ attr_name, attr_value = str(row[0]), str(row[1])
+ if attr_name not in attributes[n]:
+ attributes[n][attr_name] = []
+ attributes[n][attr_name].append(attr_value)
+ return attributes
+
+ def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query=''):
+ # TODO: review for refactoring
+ self.logger('Analyzing "%s"... ' % (source_string))
+ rets = []
+ for trie in model[model.DICTIONARY_KEY]:
+ ret = []
+ word_separator = trie[model.WORD_SEPARATOR_KEY]
+ start_index, end_index, string_so_far = -1, 0, ''
+ reading_entity = source_string[0:1] != word_separator
+ trie_is_compressed = bool(trie[model.COMPRESSED_KEY])
+ subtrie = trie[model.CONTENT_KEY][normalizer_name]
+ shorter_alternative = None
+ current_index = 0
+ temporary_index = -1
+ total_length = len(source_string)
+ dictionary_specs = trie[model.SPECS_KEY]['fields'].keys()
+ while current_index < total_length:
+ if len(ret) > 0 and current_index < ret[-1][-1]:
+ current_index = ret[-1][-1]
+ if not reading_entity: # wait for word separator
+ character = source_string[current_index]
+ start_index = current_index
+ if character == word_separator:
+ reading_entity = True
+ end_index = start_index
+ else: # reading entity
+ end_index = current_index
+ character = source_string[current_index]
+ if character == word_separator and model.ENTITY_KEY in subtrie:
+ found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
+ if found_object:
+ identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
+ shorter_alternative = (identified[0], identified[1], string_so_far, start_index + 1, end_index)
+ if character in subtrie:
+ if character == word_separator and temporary_index == -1:
+ temporary_index = current_index
+ string_so_far += character
+ subtrie = self.unpack_trie(model, subtrie[character], trie_is_compressed)
+ else:
+ if character == word_separator or current_index == total_length:
+ if model.ENTITY_KEY in subtrie:
+ found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
+ if found_object:
+ identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
+ ret.append((identified[0], identified[1], string_so_far, start_index + 1, end_index))
+ shorter_alternative = None
+ else:
+ if shorter_alternative:
+ ret.append(shorter_alternative)
+ shorter_alternative = None
+ else:
+ if shorter_alternative:
+ ret.append(shorter_alternative)
+ shorter_alternative = None
+ start_index = current_index
+ else:
+ if shorter_alternative:
+ ret.append(shorter_alternative)
+ shorter_alternative = None
+ if temporary_index == -1:
+ reading_entity = False
+ else:
+ current_index = temporary_index
+ temporary_index = -1
+ reading_entity = True
+ string_so_far = ''
+ start_index = current_index
+ subtrie = trie[model.CONTENT_KEY][normalizer_name]
+ current_index += 1
+ if model.ENTITY_KEY in subtrie:
+ found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
+ if found_object:
+ identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
+ ret.append((identified[0], identified[1], string_so_far, start_index + 1, current_index - 1))
+ elif shorter_alternative:
+ ret.append(shorter_alternative)
+ elif shorter_alternative:
+ ret.append(shorter_alternative)
+ if model[model.KEYWORDS_KEY] is not None:
+ self.verify_keywords(model, ret, source_string, word_separator)
+ rets += ret
+ self.logger('Done.')
+ return rets
+
+ def flatten(self, layers):
+ ret = {}
+ self.o_cursor.execute('delete from flatten;')
+ self.o_connection.commit()
+ for layer in layers:
+ _map = layer[0]
+ _recognized = layer[1]
+ for span in _recognized:
+ _ids, _content, _left, _right = span[0], span[1], _map[span[3]], _map[span[4]]
+ for _id in _ids:
+ _attrs = _content[_id]
+ for _attr_name in _attrs:
+ for _attr_value in _attrs[_attr_name]:
+ self.o_cursor.execute('insert into flatten (l, r, attr_name, attr_value) select ?, ?, ?, ?;', (_left, _right, _attr_name, _attr_value))
+ rows = self.o_cursor.execute('select f1.l, f1.r, f1.attr_name, f1.attr_value from flatten f1 where not exists (select f2.* from flatten f2 where (f2.l <= f1.l and f2.r > f1.r) or (f2.l < f1.l and f2.r >= f1.r)) order by f1.l asc, f1.r asc;')
+ for row in rows:
+ _location, _attr_name, _attr_value = tuple([int(row[0]), int(row[1])]), str(row[2]), str(row[3])
+ if _location not in ret:
+ ret[_location] = {}
+ if _attr_name not in ret[_location]:
+ ret[_location][_attr_name] = set()
+ ret[_location][_attr_name].add(_attr_value)
+ return ret
+
+ def reduce(self, segments):
+ def intersects(segment1, segment2):
+ return segment2[0] >= segment1[0] and segment2[0] <= segment1[1]
+ def length(segment):
+ return segment[1] - segment[0]
+ sorted_segments = [[x] for x in sorted(sorted(segments, key=lambda x: x[1] - x[0]), key=lambda x: x[0])]
+ for i in range(len(sorted_segments) - 1):
+ if len(sorted_segments[i]) == 0:
+ continue
+ if intersects(sorted_segments[i][0], sorted_segments[i+1][0]):
+ if length(sorted_segments[i][0]) >= length(sorted_segments[i+1][0]):
+ sorted_segments[i+1] = sorted_segments[i]
+ sorted_segments[i] = []
+ elif length(sorted_segments[i][0]) < length(sorted_segments[i+1][0]):
+ recovered = False
+ for j in range(1, len(sorted_segments[i])):
+ if not intersects(sorted_segments[i][j], sorted_segments[i+1][0]):
+ sorted_segments[i][0] = sorted_segments[i][j]
+ recovered = True
+ break
+ if not recovered:
+ sorted_segments[i+1] += sorted_segments[i]
+ sorted_segments[i] = []
+ ret = [x[0] for x in sorted_segments if len(x) > 0]
+ return ret
+
+ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
+ attributes = attrs_where
+ if attributes is None:
+ attributes = {}
+ for action in ['+', '-']:
+ if action not in attributes:
+ attributes[action] = {}
+
+ process_exclude = False
+ include_set, include_query = set(), ''
+ for attr_name in attributes['+']:
+ for attr_value in attributes['+'][attr_name]:
+ include_set.add('(attr_name = \'' + attr_name.replace('\'', '\'\'') + '\' and attr_value = \'' + attr_value.replace('\'', '\'\'') + '\')')
+ if len(include_set) > 0:
+ include_query = 'and (' + ' or '.join(include_set) + ')'
+
+ exclude_set, exclude_query = set(), ''
+ for attr_name in attributes['-']:
+ for attr_value in attributes['-'][attr_name]:
+ exclude_set.add('(attr_name = \'' + attr_name.replace('\'', '\'\'') + '\' and attr_value = \'' + attr_value.replace('\'', '\'\'') + '\')')
+ if len(exclude_set) > 0:
+ exclude_query = 'and (' + ' or '.join(exclude_set) + ')'
+ process_exclude = True
+
+ attrs_out_query = ''
+ if attrs_out is not None and len(attrs_out) > 0:
+ attrs_out_query = ' and attr_name in (\'%s\')' % ('\', \''.join([x.replace('\'', '\'\'') for x in attrs_out]))
+
+ rets = []
+ for normalizer_name in model[model.NORMALIZER_KEY]:
+ normalized_string = model[model.NORMALIZER_KEY][normalizer_name].normalize(source_string, model[model.WORD_SEPARATOR_KEY], model[model.TOKENIZER_OPTION_KEY])
+ character_map = model[model.NORMALIZER_KEY][normalizer_name].result['map']
+ parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query)
+ rets.append((character_map, parsed))
+
+ flattened = self.flatten(rets)
+ locations = self.reduce(flattened.keys())
+ ret = {location: flattened[location] for location in locations}
+ return ret
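For intuition about reduce() above: segments are ordered by start position (with a secondary ordering by length), and overlapping neighbors are resolved in favor of the longer span; on equal length the earlier span wins. A minimal sketch, assuming pilsner is importable, reusing the toy segments from the commented-out block in test/sandbox.py further below; the printed result follows from tracing the code:

    import pilsner

    r = pilsner.Recognizer()
    segments = [(1, 2), (3, 8), (1, 6), (2, 3)]
    print(r.reduce(segments))
    # [(1, 6)] -- (1, 2) and (2, 3) fall inside (1, 6); (3, 8) ties on length, so the leftmost span wins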
diff --git a/scripts/linux/vscode.sh b/scripts/linux/vscode.sh
new file mode 100755
index 0000000..62b4cb2
--- /dev/null
+++ b/scripts/linux/vscode.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+cd `dirname $0`
+MYDIR=`pwd`
+ROOT=${MYDIR}/../..
+ENV=.env.36
+cd -
+mkdir -p ${ROOT}/.vscode
+echo '{'>${ROOT}/.vscode/settings.json
+echo ' "python.pythonPath": "${workspaceFolder}/'${ENV}'/bin/python3",'>>${ROOT}/.vscode/settings.json
+echo ' "code-runner.executorMap": {"python": "./'${ENV}'/bin/python3"}'>>${ROOT}/.vscode/settings.json
+echo '}'>>${ROOT}/.vscode/settings.json
+echo '{'>${ROOT}/.markdownlint.json
+echo ' "MD024": {"siblings_only": true},'>>${ROOT}/.markdownlint.json
+echo ' "MD013": {"line_length": 1000}'>>${ROOT}/.markdownlint.json
+echo '}'>>${ROOT}/.markdownlint.json
diff --git a/scripts/win/vscode.bat b/scripts/win/vscode.bat
new file mode 100644
index 0000000..2ef5a69
--- /dev/null
+++ b/scripts/win/vscode.bat
@@ -0,0 +1,12 @@
+@echo off
+set ROOT=%~dp0..\..
+set ENV=.env.37
+if not exist %ROOT%\.vscode\nul mkdir %ROOT%\.vscode
+echo {>%ROOT%\.vscode\settings.json
+echo "python.pythonPath": "${workspaceFolder}\\%ENV%\\Scripts\\python.exe",>>%ROOT%\.vscode\settings.json
+echo "code-runner.executorMap": {"python": "call .\\%ENV%\\Scripts\\python"}>>%ROOT%\.vscode\settings.json
+echo }>>%ROOT%\.vscode\settings.json
+echo {>%ROOT%\.markdownlint.json
+echo "MD024": {"siblings_only": true},>>%ROOT%\.markdownlint.json
+echo "MD013": {"line_length": 1000}>>%ROOT%\.markdownlint.json
+echo }>>%ROOT%\.markdownlint.json
diff --git a/test/assets/sample_dictionary.txt b/test/assets/sample_dictionary.txt
new file mode 100644
index 0000000..ad90ce7
--- /dev/null
+++ b/test/assets/sample_dictionary.txt
@@ -0,0 +1,7 @@
+tokenizer1 entity2 acinic cell carcinomas C,D,E
+tokenizer1 entity2 acinic cell carcinomax D,E
+tokenizer2 entity1 acinic cell carcinomas A,B,C
+tokenizer1 entity1 acinic cell carcinoma A,B,C
+tokenizer2 entity1 afinic cell carcinoma A,B,C
+tokenizer2 entity1 it A,B,C
+tokenizer2 entity1 o A,B,C
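The sample dictionary columns are tab-separated (tabs may render as spaces above): normalizer/tokenizer name, entity id, synonym, and a comma-delimited attribute list. A sketch of the field descriptors that match this layout, mirroring what test/sandbox.py below passes to compile_dict_specs; the field names are the sandbox's, not anything mandated by the format:

    fields = [
        {'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
        {'name': 'MSID', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
        {'name': 'Value', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
        {'name': 'smth', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
    ]
    # compile_dict_specs(fields) stores, per included column:
    # specs['fields'][name] = (column_index, delimiter, normalizer_flag, value_flag)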
diff --git a/test/assets/tokenizer1.xml b/test/assets/tokenizer1.xml
new file mode 100644
index 0000000..759fe98
--- /dev/null
+++ b/test/assets/tokenizer1.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test/assets/tokenizer2.xml b/test/assets/tokenizer2.xml
new file mode 100644
index 0000000..16d6b32
--- /dev/null
+++ b/test/assets/tokenizer2.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/test/sandbox.py b/test/sandbox.py
new file mode 100644
index 0000000..6129ac0
--- /dev/null
+++ b/test/sandbox.py
@@ -0,0 +1,52 @@
+import sys; sys.path.insert(0, '')
+import pilsner # pylint: disable=E0611,F0401
+
+_messages = []
+_status = []
+
+def callback_update_message(message):
+ _messages.append(message)
+
+def callback_update_status(status):
+ _status.append(status)
+
+def save_it():
+ m = pilsner.Model()
+ m.add_normalizer('tokenizer1', 'test/assets/tokenizer1.xml')
+ m.add_normalizer('tokenizer2', 'test/assets/tokenizer2.xml')
+ m.normalizer_map = {
+ 'tokenizer1': 'tokenizer1',
+ 'tokenizer2': 'tokenizer2'
+ }
+ r = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_message)
+ specs = {'DType': (0, None, True, False), 'MSID': (1, None, False, False), 'value': (2, None, False, True)} # hand-written equivalent; replaced by compile_dict_specs(fields) below
+ fields = [
+ {'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'MSID', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'Value', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'smth', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
+ ]
+ specs = r.compile_dict_specs(fields)
+ r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=2, include_keywords=True)
+ s = 'this is afinic cell carcinoma o carcinoma, damn it'
+ q = r.parse(m, s)
+ print(q)
+ m.save('.test_model')
+
+def load_it():
+ rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_message)
+ m = pilsner.Model('.test_model')
+ s = 'this is acinic cell carcinomas o carcinoma, damn it'
+ q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
+ print(q)
+
+save_it()
+load_it()
+
+#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
+#r = Recognizer()
+#red = r.reduce(segments)
+#print(red)
+
+print(_messages)
+print(_status)
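For reference, parse() returns a dict keyed by (start, end) character spans of the original string, each mapping attribute names to sets of attribute values. A hedged sketch, assuming the '.test_model' files saved by save_it() above exist; the span and values in the comment are illustrative, not captured output:

    import pilsner

    m = pilsner.Model('.test_model')  # model previously saved by save_it()
    r = pilsner.Recognizer()
    q = r.parse(m, 'this is afinic cell carcinoma o carcinoma, damn it')
    # e.g. {(8, 28): {'MSID': {'entity1'}, 'smth': {'A', 'B', 'C'}}, ...}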
From 2bd1f471cd8ecfa1dedf152e350b3e67bfc611b8 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Mon, 10 Aug 2020 16:29:20 -0400
Subject: [PATCH 002/116] progress reporting while parsing
---
pilsner/utility.py | 24 ++++++++++++++++++++++--
test/sandbox.py | 1 +
2 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index aa2c1fb..6b83925 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -245,10 +245,16 @@ def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, proces
attributes[n][attr_name].append(attr_value)
return attributes
- def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query=''):
+ def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query='', progress_from=0, progress_to=100):
# TODO: review for refactoring
self.logger('Analyzing "%s"... ' % (source_string))
rets = []
+ this_progress_position = 0
+ last_progress_position = 0
+ total_tries = len(model[model.DICTIONARY_KEY])
+ progress_share = progress_to - progress_from
+ trie_increment = int(progress_share / total_tries)
+ current_trie_index = 0
for trie in model[model.DICTIONARY_KEY]:
ret = []
word_separator = trie[model.WORD_SEPARATOR_KEY]
@@ -260,8 +266,13 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
current_index = 0
temporary_index = -1
total_length = len(source_string)
+ increment_chars = max(int(total_length / progress_share) if total_length > progress_share else total_length - 1, 1) # never 0: used as a divisor below
dictionary_specs = trie[model.SPECS_KEY]['fields'].keys()
while current_index < total_length:
+ this_progress_position = int(current_index / increment_chars / total_tries)
+ if this_progress_position != last_progress_position:
+ last_progress_position = this_progress_position
+ self.push_message(int(progress_share * current_index / total_length / total_tries) + current_trie_index * trie_increment + progress_from, self.callback_progress)
if len(ret) > 0 and current_index < ret[-1][-1]:
current_index = ret[-1][-1]
if not reading_entity: # wait for word separator
@@ -327,6 +338,8 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
if model[model.KEYWORDS_KEY] is not None:
self.verify_keywords(model, ret, source_string, word_separator)
rets += ret
+ current_trie_index += 1
+ self.push_message(progress_to, self.callback_progress)
self.logger('Done.')
return rets
@@ -408,12 +421,19 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
if attrs_out is not None and len(attrs_out) > 0:
attrs_out_query = ' and attr_name in (\'%s\')' % ('\', \''.join([x.replace('\'', '\'\'') for x in attrs_out]))
+ self.push_message('Parsing text', self.callback_status)
rets = []
+ total_normalizers = len(model[model.NORMALIZER_KEY])
+ spot_progress_share = int(100 / total_normalizers)
+ current_normalizer_index = 0
for normalizer_name in model[model.NORMALIZER_KEY]:
normalized_string = model[model.NORMALIZER_KEY][normalizer_name].normalize(source_string, model[model.WORD_SEPARATOR_KEY], model[model.TOKENIZER_OPTION_KEY])
character_map = model[model.NORMALIZER_KEY][normalizer_name].result['map']
- parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query)
+ progress_from = current_normalizer_index * spot_progress_share
+ progress_to = (current_normalizer_index + 1) * spot_progress_share
+ parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query, progress_from=progress_from, progress_to=progress_to)
rets.append((character_map, parsed))
+ current_normalizer_index += 1
flattened = self.flatten(rets)
locations = self.reduce(flattened.keys())
diff --git a/test/sandbox.py b/test/sandbox.py
index 6129ac0..76f1adb 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -37,6 +37,7 @@ def load_it():
rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_message)
m = pilsner.Model('.test_model')
s = 'this is acinic cell carcinomas o carcinoma, damn it'
+ s *= 10
q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
print(q)
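With this patch, callback_progress now receives integer percentages in the 0-100 range while callback_status keeps receiving strings such as 'Parsing text'. A minimal sketch of wiring both to plain lists:

    import pilsner

    progress = []  # will collect ints between 0 and 100
    status = []    # will collect status strings
    r = pilsner.Recognizer(callback_status=status.append, callback_progress=progress.append)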
From f34de1b89b0e013a5a7bf738cf47e84788510f69 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Tue, 11 Aug 2020 00:49:31 -0400
Subject: [PATCH 003/116] flatten() without SQL
---
pilsner/utility.py | 32 +++++++++++++++-----------------
test/sandbox.py | 11 ++++++++++-
2 files changed, 25 insertions(+), 18 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 6b83925..841f2b8 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -1,13 +1,11 @@
import logging
-import sqlite3
import os
class Recognizer():
- OPERATIONAL_STORAGE = ''
PERMANENT_STORAGE = ''
- def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None, permanent_storage='', operational_storage=':memory:'):
+ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None, permanent_storage=''):
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
self.debug = debug_mode
self.verbose = verbose_mode
@@ -20,20 +18,11 @@ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, c
self.callback_status = callback_status
self.callback_progress = callback_progress
self.PERMANENT_STORAGE = permanent_storage
- self.OPERATIONAL_STORAGE = operational_storage
- self.o_connection = sqlite3.connect(self.OPERATIONAL_STORAGE)
- self.o_cursor = self.o_connection.cursor()
- self.o_cursor.execute('create table if not exists flatten (l integer, r integer, attr_name text, attr_value text);')
- self.o_cursor.execute('delete from flatten;')
- self.o_connection.commit()
logging.debug('Recognizer class has been initialized')
def __del__(self):
# remove all temporary resources
- self.o_connection.close()
- if os.path.exists(self.OPERATIONAL_STORAGE):
- os.remove(self.OPERATIONAL_STORAGE)
-
+ pass
def push_message(self, message, callback_function):
if callback_function is not None:
callback_function(message)
@@ -344,9 +333,11 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
return rets
def flatten(self, layers):
+ # TODO: prettify this
+ #print(layers)
+ #exit(0)
ret = {}
- self.o_cursor.execute('delete from flatten;')
- self.o_connection.commit()
+ qwe = []
for layer in layers:
_map = layer[0]
_recognized = layer[1]
@@ -356,8 +347,15 @@ def flatten(self, layers):
_attrs = _content[_id]
for _attr_name in _attrs:
for _attr_value in _attrs[_attr_name]:
- self.o_cursor.execute('insert into flatten (l, r, attr_name, attr_value) select ?, ?, ?, ?;', (_left, _right, _attr_name, _attr_value))
- rows = self.o_cursor.execute('select f1.l, f1.r, f1.attr_name, f1.attr_value from flatten f1 where not exists (select f2.* from flatten f2 where (f2.l <= f1.l and f2.r > f1.r) or (f2.l < f1.l and f2.r >= f1.r)) order by f1.l asc, f1.r asc;')
+ qwe.append(tuple([_left, _right, _attr_name, _attr_value]))
+ qwes = sorted(sorted(qwe, key=lambda x: -x[1]), key=lambda x: x[0])
+ rows = qwes[:1] # empty-safe: nothing recognized yields no rows
+ for i in range(1, len(qwes)):
+ q = qwes[i]
+ if (rows[-1][0] <= q[0] < rows[-1][1] and rows[-1][0] < q[1] < rows[-1][1]) or (rows[-1][0] < q[0] < rows[-1][1] and rows[-1][0] < q[1] <= rows[-1][1]):
+ continue
+ rows.append(q)
+ #rows = qwes
for row in rows:
_location, _attr_name, _attr_value = tuple([int(row[0]), int(row[1])]), str(row[2]), str(row[3])
if _location not in ret:
diff --git a/test/sandbox.py b/test/sandbox.py
index 76f1adb..a45960f 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -41,7 +41,7 @@ def load_it():
q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
print(q)
-save_it()
+#save_it()
load_it()
#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
@@ -51,3 +51,12 @@ def load_it():
print(_messages)
print(_status)
+
+#layers = [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([0], {0: {'MSID': ['entity2'], 'smth': ['C', 'D', 'E']}}, 'acinic cell carcino mas', 8, 31)]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([2], {2: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'acinic carcinomas', 8, 25), ([5], {5: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'it', 45, 46), ([6], {6: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'o', 26, 27)])]
+#layers = [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([0], {0: {'MSID': ['entity2'], 'smth': ['C', 'D', 'E']}}, 'acinic cell carcino mas', 8, 31)]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([2], {2: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'acinic carcinomas', 8, 26), ([5], {5: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'it', 45, 46), ([6], {6: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'o', 26, 27)])]
+
+#rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_message)
+#x = rrr.flatten(layers)
+#print(x)
+
+
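The SQL-free flatten sorts all (l, r, attr_name, attr_value) entries by start ascending and end descending, then drops every entry whose span sits strictly inside the last kept span. A worked sketch of just that filtering step on hypothetical tuples:

    entries = [(8, 31, 'MSID', 'entity2'), (8, 25, 'MSID', 'entity1'),
               (26, 27, 'MSID', 'entity1'), (45, 46, 'MSID', 'entity1')]
    entries = sorted(sorted(entries, key=lambda x: -x[1]), key=lambda x: x[0])
    kept = [entries[0]]
    for e in entries[1:]:
        last = kept[-1]
        # drop e if its span is strictly contained in the last kept span
        if (last[0] <= e[0] < last[1] and last[0] < e[1] < last[1]) or \
           (last[0] < e[0] < last[1] and last[0] < e[1] <= last[1]):
            continue
        kept.append(e)
    print([(e[0], e[1]) for e in kept])
    # [(8, 31), (45, 46)] -- (8, 25) and (26, 27) are inside (8, 31)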
From 2b6f69820134a0d9c8c3524aae6415c58b096eb1 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Tue, 11 Aug 2020 02:15:01 -0400
Subject: [PATCH 004/116] removed permanent_storage
---
pilsner/utility.py | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 841f2b8..be4325a 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -3,9 +3,7 @@
class Recognizer():
- PERMANENT_STORAGE = ''
-
- def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None, permanent_storage=''):
+ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None):
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
self.debug = debug_mode
self.verbose = verbose_mode
@@ -17,7 +15,6 @@ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, c
self.logger('Debug mode is on')
self.callback_status = callback_status
self.callback_progress = callback_progress
- self.PERMANENT_STORAGE = permanent_storage
logging.debug('Recognizer class has been initialized')
def __del__(self):
From 2467e34fcb536dedb2a047d00a3ca3522812b027 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Tue, 11 Aug 2020 11:15:09 -0400
Subject: [PATCH 005/116] polished the code
---
pilsner/utility.py | 35 +++++++++++++----------------------
1 file changed, 13 insertions(+), 22 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index be4325a..f23b0cf 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -20,6 +20,7 @@ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, c
def __del__(self):
# remove all temporary resources
pass
+
def push_message(self, message, callback_function):
if callback_function is not None:
callback_function(message)
@@ -205,7 +206,6 @@ def check_attrs(self, model, trie_leaf, cur, specs, include_query, exclude_query
def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query):
attributes = {}
-
include = set()
exclude = set()
for n in leaf_ids:
@@ -217,9 +217,7 @@ def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, proces
rows = cur.execute('select distinct n from attrs where n = %d %s;' % (n, exclude_query))
for row in rows:
exclude.add(int(row[0]))
-
ns = include - exclude
-
for n in ns:
rows = cur.execute('select attr_name, attr_value from attrs where n = %d%s;' % (n, attrs_out_query))
if n not in attributes:
@@ -330,11 +328,8 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
return rets
def flatten(self, layers):
- # TODO: prettify this
- #print(layers)
- #exit(0)
ret = {}
- qwe = []
+ all_entries = []
for layer in layers:
_map = layer[0]
_recognized = layer[1]
@@ -344,17 +339,16 @@ def flatten(self, layers):
_attrs = _content[_id]
for _attr_name in _attrs:
for _attr_value in _attrs[_attr_name]:
- qwe.append(tuple([_left, _right, _attr_name, _attr_value]))
- qwes = sorted(sorted(qwe, key=lambda x: -x[1]), key=lambda x: x[0])
- rows = qwes[:1] # empty-safe: nothing recognized yields no rows
- for i in range(1, len(qwes)):
- q = qwes[i]
- if (rows[-1][0] <= q[0] < rows[-1][1] and rows[-1][0] < q[1] < rows[-1][1]) or (rows[-1][0] < q[0] < rows[-1][1] and rows[-1][0] < q[1] <= rows[-1][1]):
+ all_entries.append(tuple([_left, _right, _attr_name, _attr_value]))
+ all_entries = sorted(sorted(all_entries, key=lambda x: -x[1]), key=lambda x: x[0])
+ filtered_entries = all_entries[:1] # empty-safe: nothing recognized yields no entries
+ for i in range(1, len(all_entries)):
+ q = all_entries[i]
+ if (filtered_entries[-1][0] <= q[0] < filtered_entries[-1][1] and filtered_entries[-1][0] < q[1] < filtered_entries[-1][1]) or (filtered_entries[-1][0] < q[0] < filtered_entries[-1][1] and filtered_entries[-1][0] < q[1] <= filtered_entries[-1][1]):
continue
- rows.append(q)
- #rows = qwes
- for row in rows:
- _location, _attr_name, _attr_value = tuple([int(row[0]), int(row[1])]), str(row[2]), str(row[3])
+ filtered_entries.append(q)
+ for entry in filtered_entries:
+ _location, _attr_name, _attr_value = tuple([int(entry[0]), int(entry[1])]), str(entry[2]), str(entry[3])
if _location not in ret:
ret[_location] = {}
if _attr_name not in ret[_location]:
@@ -395,7 +389,6 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
for action in ['+', '-']:
if action not in attributes:
attributes[action] = {}
-
process_exclude = False
include_set, include_query = set(), ''
for attr_name in attributes['+']:
@@ -403,7 +396,6 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
include_set.add('(attr_name = \'' + attr_name.replace('\'', '\'\'') + '\' and attr_value = \'' + attr_value.replace('\'', '\'\'') + '\')')
if len(include_set) > 0:
include_query = 'and (' + ' or '.join(include_set) + ')'
-
exclude_set, exclude_query = set(), ''
for attr_name in attributes['-']:
for attr_value in attributes['-'][attr_name]:
@@ -411,11 +403,10 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
if len(exclude_set) > 0:
exclude_query = 'and (' + ' or '.join(exclude_set) + ')'
process_exclude = True
-
attrs_out_query = ''
if attrs_out is not None and len(attrs_out) > 0:
attrs_out_query = ' and attr_name in (\'%s\')' % ('\', \''.join([x.replace('\'', '\'\'') for x in attrs_out]))
-
+ self.logger('Parsing text...')
self.push_message('Parsing text', self.callback_status)
rets = []
total_normalizers = len(model[model.NORMALIZER_KEY])
@@ -429,8 +420,8 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query, progress_from=progress_from, progress_to=progress_to)
rets.append((character_map, parsed))
current_normalizer_index += 1
-
flattened = self.flatten(rets)
locations = self.reduce(flattened.keys())
ret = {location: flattened[location] for location in locations}
+ self.logger('Done parsing text.')
return ret
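For reference, parse() turns attrs_where and attrs_out into SQL fragments appended to the attrs lookups. The strings below are derived by reading the code above, not captured from a run:

    attrs_where = {'+': {'smth': {'A'}}}
    attrs_out = ['MSID']
    # include_query   -> "and ((attr_name = 'smth' and attr_value = 'A'))"
    # attrs_out_query -> " and attr_name in ('MSID')"

Single quotes inside attribute names and values are doubled on the way in, so the filters survive SQL quoting.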
From 2df33ba2c6820ef7a55eb9aa7c3b0e7c37522ef9 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Tue, 11 Aug 2020 11:49:47 -0400
Subject: [PATCH 006/116] added placeholders and scripts
---
scripts/linux/unittest.sh | 14 ++++++++++++++
scripts/win/unittest.bat | 12 ++++++++++++
test/performance.py | 0
test/ut_model.py | 14 ++++++++++++++
test/ut_utility.py | 14 ++++++++++++++
5 files changed, 54 insertions(+)
create mode 100644 scripts/linux/unittest.sh
create mode 100644 scripts/win/unittest.bat
create mode 100644 test/performance.py
create mode 100644 test/ut_model.py
create mode 100644 test/ut_utility.py
diff --git a/scripts/linux/unittest.sh b/scripts/linux/unittest.sh
new file mode 100644
index 0000000..ec23305
--- /dev/null
+++ b/scripts/linux/unittest.sh
@@ -0,0 +1,14 @@
+RUNDIR=`pwd`
+cd `dirname $0`
+MYDIR=`pwd`
+ROOT=${MYDIR}/../..
+ENV=.env.36
+TEST=${ROOT}/test
+FILES="ut_model.py ut_utility.py performance.py"
+cd ${ROOT}
+for FILE in ${FILES}
+do
+ echo Running ${FILE}
+ ${ROOT}/${ENV}/bin/python3 ${TEST}/${FILE} -b
+done
+cd ${RUNDIR}
diff --git a/scripts/win/unittest.bat b/scripts/win/unittest.bat
new file mode 100644
index 0000000..ae32a49
--- /dev/null
+++ b/scripts/win/unittest.bat
@@ -0,0 +1,12 @@
+@echo off
+set RUNDIR=%cd%
+set ROOT=%~dp0..\..
+set ENV=.env.37
+set TEST=%ROOT%\test
+set FILES=ut_model.py ut_utility.py performance.py
+cd %ROOT%
+(for %%f in (%FILES%) do (
+ echo Running %%f
+ call %ROOT%\%ENV%\Scripts\python.exe %TEST%\%%f -b
+))
+cd %RUNDIR%
diff --git a/test/performance.py b/test/performance.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/ut_model.py b/test/ut_model.py
new file mode 100644
index 0000000..16dd590
--- /dev/null
+++ b/test/ut_model.py
@@ -0,0 +1,14 @@
+import sys; sys.path.insert(0, '')
+import unittest
+import pilsner # pylint: disable=E0611,F0401
+
+class TestModel(unittest.TestCase):
+
+ def setUp(self):
+ pass
+
+ def tearDown(self):
+ pass
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/ut_utility.py b/test/ut_utility.py
new file mode 100644
index 0000000..16dd590
--- /dev/null
+++ b/test/ut_utility.py
@@ -0,0 +1,14 @@
+import sys; sys.path.insert(0, '')
+import unittest
+import pilsner # pylint: disable=E0611,F0401
+
+class TestModel(unittest.TestCase):
+
+ def setUp(self):
+ pass
+
+ def tearDown(self):
+ pass
+
+if __name__ == '__main__':
+ unittest.main()
From 6ed89d1332cfa202414a5dad258c8942d8782bb0 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 12 Aug 2020 18:39:54 -0400
Subject: [PATCH 007/116] updated names of functions
---
pilsner/utility.py | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index f23b0cf..037ddc9 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -327,7 +327,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
self.logger('Done.')
return rets
- def flatten(self, layers):
+ def flatten_spans(self, layers):
ret = {}
all_entries = []
for layer in layers:
@@ -356,7 +356,7 @@ def flatten(self, layers):
ret[_location][_attr_name].add(_attr_value)
return ret
- def reduce(self, segments):
+ def reduce_spans(self, segments):
def intersects(segment1, segment2):
return segment2[0] >= segment1[0] and segment2[0] <= segment1[1]
def length(segment):
@@ -420,8 +420,8 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query, progress_from=progress_from, progress_to=progress_to)
rets.append((character_map, parsed))
current_normalizer_index += 1
- flattened = self.flatten(rets)
- locations = self.reduce(flattened.keys())
+ flattened = self.flatten_spans(rets)
+ locations = self.reduce_spans(flattened.keys())
ret = {location: flattened[location] for location in locations}
self.logger('Done parsing text.')
return ret
From 61e3f05b54eceee8d1c5dd9fde3ff3e2878db8ca Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 12 Aug 2020 18:40:21 -0400
Subject: [PATCH 008/116] prettified attribute_wrapper
---
pilsner/model.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 3dd51fa..8ce1dfd 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -179,7 +179,7 @@ def attribute_wrapper(self, line_number, normalizer_name, internal_id, subtrie,
if not specs['fields'][k][1]:
self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, columns[specs['fields'][k][0]]))
else:
- _ = [ self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split( specs['fields'][k][1]) ) ]
+ _ = [self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split(specs['fields'][k][1]))]
def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall):
columns = [x.strip(cell_wall) for x in line.split(column_separator)]
From c142c655ad4007751e2ff04618ac079b6fe04407 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 12 Aug 2020 18:40:42 -0400
Subject: [PATCH 009/116] placeholders for unit tests
---
test/ut_model.py | 36 ++++++++++++++++++++++++++++++++++++
test/ut_utility.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 81 insertions(+)
diff --git a/test/ut_model.py b/test/ut_model.py
index 16dd590..0127d36 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -10,5 +10,41 @@ def setUp(self):
def tearDown(self):
pass
+ def test_init(self):
+ pass
+
+ def test_del(self):
+ pass
+
+ def test_save(self):
+ pass
+
+ def test_load(self):
+ pass
+
+ def test_add_normalizer(self):
+ pass
+
+ def test_create_recognizer_schema(self):
+ pass
+
+ def test_pack_subtrie(self):
+ pass
+
+ def test_pack_trie(self):
+ pass
+
+ def test_attribute_wrapper(self):
+ pass
+
+ def test_get_dictionary_line(self):
+ pass
+
+ def test_get_dictionary_synonym(self):
+ pass
+
+ def test_next_trie(self):
+ pass
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 16dd590..7fe3a47 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -10,5 +10,50 @@ def setUp(self):
def tearDown(self):
pass
+ def test_init(self):
+ pass
+
+ def test_del(self):
+ pass
+
+ def test_push_message(self):
+ pass
+
+ def test_compile_dict_specs(self):
+ pass
+
+ def test_make_recognizer(self):
+ pass
+
+ def test_make_keywords(self):
+ pass
+
+ def test_compile_model(self):
+ pass
+
+ def test_verify_keywords(self):
+ pass
+
+ def test_unpack_trie(self):
+ pass
+
+ def test_check_attrs(self):
+ pass
+
+ def test_attribute_unpacker(self):
+ pass
+
+ def test_spot_entities(self):
+ pass
+
+ def test_flatten_spans(self):
+ pass
+
+ def test_reduce_spans(self):
+ pass
+
+ def test_parse(self):
+ pass
+
if __name__ == '__main__':
unittest.main()
From 3721f50007161741ba3323d21c8181dcb1e0a42c Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 12 Aug 2020 21:39:54 -0400
Subject: [PATCH 010/116] unit tests for constructors and destructors
---
test/ut_model.py | 13 +++++++++++--
test/ut_utility.py | 8 ++++++--
2 files changed, 17 insertions(+), 4 deletions(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index 0127d36..8e8d533 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -1,3 +1,4 @@
+import os
import sys; sys.path.insert(0, '')
import unittest
import pilsner # pylint: disable=E0611,F0401
@@ -11,10 +12,18 @@ def tearDown(self):
pass
def test_init(self):
- pass
+ m = pilsner.Model()
+ assert 'm' in locals(), 'Instance of Model class has not been created'
+ assert type(m) == pilsner.Model, 'Model is expected to have pilsner.Model type, but has %s instead' % (str(type(m)))
+ storage = m.DEFAULT_DATASOURCE
+ assert storage.lower() == ':memory:' or os.path.exists(storage), 'Model storage is not where it is supposed to be'
def test_del(self):
- pass
+ m = pilsner.Model()
+ storage = m.DEFAULT_DATASOURCE
+ del(m)
+ assert 'm' not in locals(), 'Instance of Model class has not been destroyed'
+ assert storage.lower() == ':memory:' or not os.path.exists(storage), 'Model storage is supposed to be removed once class has been destroyed'
def test_save(self):
pass
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 7fe3a47..c2399c0 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -11,10 +11,14 @@ def tearDown(self):
pass
def test_init(self):
- pass
+ r = pilsner.Recognizer()
+ assert 'r' in locals(), 'Instance of Recognizer class has not been created'
+ assert type(r) == pilsner.Recognizer, 'Utility is supposed to have pilsner.Recognizer type, but has %s instead' % (str(type(r)))
def test_del(self):
- pass
+ r = pilsner.Recognizer()
+ del(r)
+ assert 'r' not in locals(), 'Instance of Recognizer class has not been destroyed'
def test_push_message(self):
pass
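Note: these two tests lean on CPython reference-counting semantics: `del m` drops the name from `locals()` and, with no remaining references, fires `Model.__del__`, which removes the temporary database. A minimal sketch of the same lifecycle contract with a generic class and a temp file (names are illustrative):

    import os
    import tempfile

    # Lifecycle sketch: __del__ removes a temporary file, and `del` drops the
    # name from the local namespace, mirroring what the two tests assert.
    class Disposable:
        def __init__(self):
            fd, self.path = tempfile.mkstemp()
            os.close(fd)
        def __del__(self):
            if os.path.exists(self.path):
                os.remove(self.path)

    d = Disposable()
    path = d.path
    assert os.path.exists(path)
    del d
    assert 'd' not in locals()
    # CPython's reference counting runs __del__ right away; other runtimes may defer it.
    assert not os.path.exists(path)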
From 88882d2deb7df51de6438f387308782f70697a96 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 13 Aug 2020 13:57:50 -0400
Subject: [PATCH 011/116] test_save, test_load
---
test/ut_model.py | 29 +++++++++++++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index 8e8d533..afe78cc 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -26,10 +26,35 @@ def test_del(self):
assert storage.lower() == ':memory:' or not os.path.exists(storage), 'Model storage is supposed to be removed once class has been destroyed'
def test_save(self):
- pass
+ m = pilsner.Model()
+ m[m.DICTIONARY_KEY].append({})
+ m.save('./.test_save')
+ del(m)
+ assert os.path.exists('./.test_save.0.dictionary'), 'Dictionary file was not saved'
+ assert os.path.exists('./.test_save.attributes'), 'Attributes file was not saved'
+ assert os.path.exists('./.test_save.keywords'), 'Keywords file was not saved'
+ assert os.path.exists('./.test_save.normalizers'), 'Normalizers file was not saved'
+ os.remove('./.test_save.0.dictionary')
+ os.remove('./.test_save.attributes')
+ os.remove('./.test_save.keywords')
+ os.remove('./.test_save.normalizers')
def test_load(self):
- pass
+ m1 = pilsner.Model()
+ m1[m1.DICTIONARY_KEY].append({'a': {'b': {'c': 'def'}}})
+ m1[m1.DICTIONARY_KEY].append({'g': {'h': {'i': 'jkl'}}})
+ m1.save('./.test_load')
+ expected = m1[m1.DICTIONARY_KEY]
+ del(m1)
+ m2 = pilsner.Model()
+ m2.load('./.test_load')
+ assert m2[m2.DICTIONARY_KEY] == expected, 'Loaded model %s != saved model %s' % (str(m2[m2.DICTIONARY_KEY]), str(expected))
+ del(m2)
+ os.remove('./.test_load.0.dictionary')
+ os.remove('./.test_load.1.dictionary')
+ os.remove('./.test_load.attributes')
+ os.remove('./.test_load.keywords')
+ os.remove('./.test_load.normalizers')
def test_add_normalizer(self):
pass
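Note: both tests follow the same save/load round-trip discipline: persist, reload into a fresh instance, compare for equality, then remove every artifact file. A stripped-down sketch of that discipline with plain pickle (the multi-file layout `.0.dictionary`, `.attributes`, etc. is pilsner-specific and omitted here):

    import os
    import pickle

    # Round-trip sketch: save a structure, load it back, compare, clean up.
    expected = [{'a': {'b': {'c': 'def'}}}, {'g': {'h': {'i': 'jkl'}}}]
    with open('.test_roundtrip', 'wb') as f:
        pickle.dump(expected, f)
    with open('.test_roundtrip', 'rb') as f:
        loaded = pickle.load(f)
    assert loaded == expected, 'Loaded %s != saved %s' % (loaded, expected)
    os.remove('.test_roundtrip')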
From aed6b9b4bbd537d7e4fcea14c98fa0f4f75964a4 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 13 Aug 2020 13:59:41 -0400
Subject: [PATCH 012/116] renamed TestUtility class
---
test/ut_utility.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index c2399c0..4220560 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -2,7 +2,7 @@
import unittest
import pilsner # pylint: disable=E0611,F0401
-class TestModel(unittest.TestCase):
+class TestUtility(unittest.TestCase):
def setUp(self):
pass
From 977220fb6c1725583f9f9eb84c66a0fdbfa71aaa Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 13 Aug 2020 16:51:27 -0400
Subject: [PATCH 013/116] unused arguments
---
pilsner/model.py | 2 +-
pilsner/utility.py | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 8ce1dfd..7b4090a 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -169,7 +169,7 @@ def pack_trie(self, trie, compressed):
ret[self.CONTENT_KEY][normalizer_name] = packed
return ret
- def attribute_wrapper(self, line_number, normalizer_name, internal_id, subtrie, trie, specs, columns):
+ def attribute_wrapper(self, line_number, internal_id, subtrie, specs, columns):
if self.ENTITY_KEY not in subtrie:
subtrie[self.ENTITY_KEY] = []
subtrie[self.ENTITY_KEY].append(line_number)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 037ddc9..0f4506d 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -30,6 +30,7 @@ def compile_dict_specs(self, fields):
specs = {'fields': {}, 'id': None, 'tokenizer': None, 'value': None}
# {'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
# specs = {'DType': (0, None, False, True, False), 'MSID': (1, None, True, False, False), 'value': (2, None, False, False, True)}
+ # specs = {'attr_name': (column_index, delimiter, normalizer_flag, value_flag)}
for i in range(0, len(fields)):
field = fields[i]
if not field['include']:
@@ -83,7 +84,7 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
if character not in subtrie:
subtrie[character] = {}
subtrie = subtrie[character]
- model.attribute_wrapper(line_number, normalizer_name, internal_id, subtrie, trie, specs, columns)
+ model.attribute_wrapper(line_number, internal_id, subtrie, specs, columns)
line_count += 1
line_number += 1
if line_count > 0 and len(trie) > 3:
From 64b8b264986397d8c96ee1df996eb9862cbd49ff Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 13 Aug 2020 16:51:52 -0400
Subject: [PATCH 014/116] updated sandbox
---
test/sandbox.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/test/sandbox.py b/test/sandbox.py
index a45960f..e8499f1 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -27,7 +27,7 @@ def save_it():
{'name': 'smth', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
]
specs = r.compile_dict_specs(fields)
- r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=2, include_keywords=True)
+ r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
s = 'this is afinic cell carcinoma o carcinoma, damn it'
q = r.parse(m, s)
print(q)
@@ -41,7 +41,7 @@ def load_it():
q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
print(q)
-#save_it()
+save_it()
load_it()
#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
From 2eea936c9660c62b4d45dbcea3c8b02dd2ff3915 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 13 Aug 2020 17:03:41 -0400
Subject: [PATCH 015/116] functional test placeholder
---
scripts/linux/unittest.sh | 2 +-
scripts/win/unittest.bat | 2 +-
test/functional.py | 12 ++++++++++++
3 files changed, 14 insertions(+), 2 deletions(-)
create mode 100644 test/functional.py
diff --git a/scripts/linux/unittest.sh b/scripts/linux/unittest.sh
index ec23305..dd02fd2 100644
--- a/scripts/linux/unittest.sh
+++ b/scripts/linux/unittest.sh
@@ -4,7 +4,7 @@ MYDIR=`pwd`
ROOT=${MYDIR}/../..
ENV=.env.36
TEST=${ROOT}/test
-FILES="ut_model.py ut_utility.py performance.py"
+FILES="ut_model.py ut_utility.py functional.py performance.py"
cd ${ROOT}
for FILE in ${FILES}
do
diff --git a/scripts/win/unittest.bat b/scripts/win/unittest.bat
index ae32a49..4453ba7 100644
--- a/scripts/win/unittest.bat
+++ b/scripts/win/unittest.bat
@@ -3,7 +3,7 @@ set RUNDIR=%cd%
set ROOT=%~dp0..\..
set ENV=.env.37
set TEST=%ROOT%\test
-set FILES=ut_model.py ut_utility.py performance.py
+set FILES=ut_model.py ut_utility.py functional.py performance.py
cd %ROOT%
(for %%f in (%FILES%) do (
echo Running %%f
diff --git a/test/functional.py b/test/functional.py
new file mode 100644
index 0000000..c4911a1
--- /dev/null
+++ b/test/functional.py
@@ -0,0 +1,12 @@
+import os
+import sys; sys.path.insert(0, '')
+import unittest
+import pilsner # pylint: disable=E0611,F0401
+
+class FunctionalTest(unittest.TestCase):
+
+ def test_ad_hoc_load_model(self):
+ pass
+
+if __name__ == '__main__':
+ unittest.main()
From 03506f74ba10240ae2fc1651dab886d1697b6d08 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 13 Aug 2020 17:03:52 -0400
Subject: [PATCH 016/116] unit tests
---
test/ut_model.py | 56 +++++++++++++++++++++++++++++++-----------------
1 file changed, 36 insertions(+), 20 deletions(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index afe78cc..b5a7c8b 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -6,10 +6,10 @@
class TestModel(unittest.TestCase):
def setUp(self):
- pass
+ self.model = pilsner.Model()
def tearDown(self):
- pass
+ del(self.model)
def test_init(self):
m = pilsner.Model()
@@ -26,10 +26,8 @@ def test_del(self):
assert storage.lower() == ':memory:' or not os.path.exists(storage), 'Model storage is supposed to be removed once class has been destroyed'
def test_save(self):
- m = pilsner.Model()
- m[m.DICTIONARY_KEY].append({})
- m.save('./.test_save')
- del(m)
+ self.model[self.model.DICTIONARY_KEY].append({})
+ self.model.save('./.test_save')
assert os.path.exists('./.test_save.0.dictionary'), 'Dictionary file was not saved'
assert os.path.exists('./.test_save.attributes'), 'Attributes file was not saved'
assert os.path.exists('./.test_save.keywords'), 'Keywords file was not saved'
@@ -40,16 +38,14 @@ def test_save(self):
os.remove('./.test_save.normalizers')
def test_load(self):
- m1 = pilsner.Model()
- m1[m1.DICTIONARY_KEY].append({'a': {'b': {'c': 'def'}}})
- m1[m1.DICTIONARY_KEY].append({'g': {'h': {'i': 'jkl'}}})
- m1.save('./.test_load')
- expected = m1[m1.DICTIONARY_KEY]
- del(m1)
- m2 = pilsner.Model()
- m2.load('./.test_load')
- assert m2[m2.DICTIONARY_KEY] == expected, 'Loaded model %s != saved model %s' % (str(m2[m2.DICTIONARY_KEY]), str(expected))
- del(m2)
+ self.model[self.model.DICTIONARY_KEY].append({'a': {'b': {'c': 'def'}}})
+ self.model[self.model.DICTIONARY_KEY].append({'g': {'h': {'i': 'jkl'}}})
+ self.model.save('./.test_load')
+ expected = self.model[self.model.DICTIONARY_KEY]
+ another_model = pilsner.Model()
+ another_model.load('./.test_load')
+ assert another_model[another_model.DICTIONARY_KEY] == expected, 'Loaded model %s != saved model %s' % (str(another_model[another_model.DICTIONARY_KEY]), str(expected))
+ del(another_model)
os.remove('./.test_load.0.dictionary')
os.remove('./.test_load.1.dictionary')
os.remove('./.test_load.attributes')
@@ -57,16 +53,36 @@ def test_load(self):
os.remove('./.test_load.normalizers')
def test_add_normalizer(self):
- pass
+ self.model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
+ normalization_units_count = len(self.model[self.model.NORMALIZER_KEY])
+ assert normalization_units_count == 1, 'Model is expected to have 1 normalization unit (it has %d instead)' % (normalization_units_count)
def test_create_recognizer_schema(self):
- pass
+ self.model.create_recognizer_schema(self.model.cursor)
+ rows = self.model.cursor.execute('select name from sqlite_master where type = \'table\' and name = \'attrs\';')
+ assert len(list(rows)) == 1, 'Created schema does not contain table \'attrs\''
+ rows = self.model.cursor.execute('select * from attrs;')
+ assert len(list(rows)) == 0, 'Table \'attrs\' in newly created schema is not empty'
def test_pack_subtrie(self):
- pass
+ # radiology, radiotelescope
+ subtrie = {'r': {'a': {'d': {'i': {'o': {'l': {'o': {'g': {'y': {self.model.ENTITY_KEY: [1]}}}}, 't': {'e': {'l': {'e': {'s': {'c': {'o': {'p': {'e': {self.model.ENTITY_KEY: [2]}}}}}}}}}}}}}}}
+ initial_path = ''
+ packed = self.model.pack_subtrie(subtrie, False, initial_path)
+ assert packed[0] == subtrie, '%s != %s' % (str(packed), str(subtrie))
+ assert packed[1] == '', 'pack_subtrie() function is supposed to return \'%s\' as path (it returned \'%s\')' % (initial_path, packed[1])
+ packed = self.model.pack_subtrie(subtrie, True, initial_path)
+ expected = {'r': {'adio': {'l': {'ogy': {self.model.ENTITY_KEY: [1]}}, 't': {'elescope': {self.model.ENTITY_KEY: [2]}}}}}
+ assert packed[0] == expected, '%s != %s' % (str(packed), str(expected))
def test_pack_trie(self):
- pass
+ # radiology, radiotelescope
+ tries = {self.model.CONTENT_KEY: {'t1': {'r': {'a': {'d': {'i': {'o': {'l': {'o': {'g': {'y': {self.model.ENTITY_KEY: [1]}}}}, 't': {'e': {'l': {'e': {'s': {'c': {'o': {'p': {'e': {self.model.ENTITY_KEY: [2]}}}}}}}}}}}}}}}, 't2': {'r': {'a': {'d': {'i': {'o': {'l': {'o': {'g': {'y': {self.model.ENTITY_KEY: [1]}}}}, 't': {'e': {'l': {'e': {'s': {'c': {'o': {'p': {'e': {self.model.ENTITY_KEY: [2]}}}}}}}}}}}}}}}}}
+ packed = self.model.pack_trie(tries, False)
+ assert packed == tries, '%s != %s' % (str(packed), str(tries))
+ packed = self.model.pack_trie(tries, True)
+ expected = {self.model.CONTENT_KEY: {'t1': {'r': {'adio': {'l': {'ogy': {self.model.ENTITY_KEY: [1]}}, 't': {'elescope': {self.model.ENTITY_KEY: [2]}}}}}, 't2': {'r': {'adio': {'l': {'ogy': {self.model.ENTITY_KEY: [1]}}, 't': {'elescope': {self.model.ENTITY_KEY: [2]}}}}}}}
+ assert packed == expected, '%s != %s' % (str(packed), str(expected))
def test_attribute_wrapper(self):
pass
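Note: test_pack_subtrie pins down the compression contract: below each one-character dispatch key, a chain of single-child nodes collapses into one multi-character edge, so radiology/radiotelescope share 'r' -> 'adio' before branching. A minimal reimplementation sketch that reproduces the expected output (this is not pilsner's pack_subtrie, which also returns the packed path and supports the uncompressed mode):

    ENTITY_KEY = '~i'  # same sentinel the tests use

    def pack(node):
        # Keep one-character dispatch keys; merge the single-child chain
        # below each of them into one multi-character edge.
        packed = {}
        for key, child in node.items():
            if key == ENTITY_KEY:
                packed[key] = child
                continue
            tail = ''
            while len(child) == 1 and ENTITY_KEY not in child:
                (k, child), = child.items()
                tail += k
            sub = pack(child)
            packed[key] = {tail: sub} if tail else sub
        return packed

    def insert(trie, word, entity_line):
        node = trie
        for ch in word:
            node = node.setdefault(ch, {})
        node[ENTITY_KEY] = [entity_line]

    trie = {}
    insert(trie, 'radiology', 1)
    insert(trie, 'radiotelescope', 2)
    print(pack(trie))
    # {'r': {'adio': {'l': {'ogy': {'~i': [1]}}, 't': {'elescope': {'~i': [2]}}}}}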
From 6bc6265d1426d2dc31edb4e260b8ba726b546f36 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Fri, 14 Aug 2020 00:04:39 -0400
Subject: [PATCH 017/116] renamed attribute_wrapper() method
---
pilsner/model.py | 2 +-
pilsner/utility.py | 2 +-
test/ut_model.py | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 7b4090a..4bba275 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -169,7 +169,7 @@ def pack_trie(self, trie, compressed):
ret[self.CONTENT_KEY][normalizer_name] = packed
return ret
- def attribute_wrapper(self, line_number, internal_id, subtrie, specs, columns):
+ def store_attributes(self, line_number, internal_id, subtrie, specs, columns):
if self.ENTITY_KEY not in subtrie:
subtrie[self.ENTITY_KEY] = []
subtrie[self.ENTITY_KEY].append(line_number)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 0f4506d..f80a710 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -84,7 +84,7 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
if character not in subtrie:
subtrie[character] = {}
subtrie = subtrie[character]
- model.attribute_wrapper(line_number, internal_id, subtrie, specs, columns)
+ model.store_attributes(line_number, internal_id, subtrie, specs, columns)
line_count += 1
line_number += 1
if line_count > 0 and len(trie) > 3:
diff --git a/test/ut_model.py b/test/ut_model.py
index b5a7c8b..00c36ae 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -84,7 +84,7 @@ def test_pack_trie(self):
expected = {self.model.CONTENT_KEY: {'t1': {'r': {'adio': {'l': {'ogy': {self.model.ENTITY_KEY: [1]}}, 't': {'elescope': {self.model.ENTITY_KEY: [2]}}}}}, 't2': {'r': {'adio': {'l': {'ogy': {self.model.ENTITY_KEY: [1]}}, 't': {'elescope': {self.model.ENTITY_KEY: [2]}}}}}}}
assert packed == expected, '%s != %s' % (str(packed), str(expected))
- def test_attribute_wrapper(self):
+ def test_store_attributes(self):
pass
def test_get_dictionary_line(self):
From bbe1090e629563b805913b19d0733ee702afdec8 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Sun, 16 Aug 2020 01:00:02 -0400
Subject: [PATCH 018/116] test_store_attributes
---
test/ut_model.py | 22 +++++++++++++++++++++-
1 file changed, 21 insertions(+), 1 deletion(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index 00c36ae..aef40cc 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -85,7 +85,27 @@ def test_pack_trie(self):
assert packed == expected, '%s != %s' % (str(packed), str(expected))
def test_store_attributes(self):
- pass
+ line_number = 123
+ internal_id = 456
+ subtrie = {}
+ specs = {'fields': {'attr1': (0, None, False, False), 'attr2': (1, None, False, True), 'attr3': (2, None, True, False), 'attr4': (3, ',', False, False)}}
+ columns = ['attr1_value', 'attr2_value', 'attr3_value', 'attr4_value_1,attr4_value_2,attr4_value_3']
+ self.model.create_recognizer_schema(self.model.cursor)
+ self.model.store_attributes(line_number, internal_id, subtrie, specs, columns)
+ rows = self.model.cursor.execute('select * from attrs;')
+ stored = set()
+ for row in rows:
+ stored.add(tuple(row))
+ expected = {
+ (123, 456, 'attr1', 'attr1_value'),
+ (123, 456, 'attr3', 'attr3_value'),
+ (123, 456, 'attr4', 'attr4_value_1'),
+ (123, 456, 'attr4', 'attr4_value_2'),
+ (123, 456, 'attr4', 'attr4_value_3')
+ }
+ assert len(stored) == len(expected), 'Expected to store %d rows (got %d instead)' % (len(expected), len(stored))
+ for entry in expected:
+ assert entry in stored, 'Entry %s was not stored' % str(entry)
def test_get_dictionary_line(self):
pass
From 15b402c20a78f294ddeb8fcca90241ae93667621 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 16 Aug 2020 21:39:08 -0400
Subject: [PATCH 019/116] test_get_dictionary_line
---
test/ut_model.py | 11 ++++++++++-
1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index aef40cc..b007437 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -108,7 +108,16 @@ def test_store_attributes(self):
assert entry in stored, 'Entry %s was not stored' % str(entry)
def test_get_dictionary_line(self):
- pass
+ specs = {'fields': {}, 'id': [0], 'tokenizer': None, 'value': None}
+ entity_ids = {}
+ line_numbers = {}
+ line_number = 1
+ line = 'entity_id\tstring_value\n'
+ column_separator = '\t'
+ cell_wall = '\n'
+ got_line = self.model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall)
+ expected = (['entity_id', 'string_value'], 0)
+ assert got_line == expected, 'Expected %s, got %s' % (expected, got_line)
def test_get_dictionary_synonym(self):
pass
From 1b3eeb3e01fb701db52b1c0de2aa09da38db7111 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 16 Aug 2020 21:43:20 -0400
Subject: [PATCH 020/116] make sure filenames are sorted when loading model
---
pilsner/model.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 4bba275..17e82de 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -100,7 +100,7 @@ def load(self, filename):
self[self.TOKENIZER_OPTION_KEY] = normalizers[self.TOKENIZER_OPTION_KEY]
self[self.DEFAULT_NORMALIZER_KEY] = normalizers[self.DEFAULT_NORMALIZER_KEY]
logging.debug('Loaded "%s"' % ('%s.normalizers' % (filename)))
- for _filename in os.listdir(os.path.dirname(filename)) if os.path.dirname(filename) != '' else os.listdir():
+ for _filename in sorted(os.listdir(os.path.dirname(filename))) if os.path.dirname(filename) != '' else sorted(os.listdir()):
if _filename.startswith(os.path.basename(filename) + '.') and _filename.endswith('.dictionary'):
with open('%s/%s' % (os.path.dirname(filename), _filename) if os.path.dirname(filename) != '' else _filename, mode='rb') as f:
dictionary = pickle.load(f)
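Note: sorting the directory listing makes multi-part loading deterministic across platforms, since os.listdir() order is unspecified. One caveat worth keeping in mind: plain lexicographic order puts `.10.dictionary` before `.2.dictionary` once a model has more than ten parts, so a numeric sort key is the more robust variant; a quick illustration:

    # Lexicographic vs numeric ordering of multi-part model files.
    names = ['model.%d.dictionary' % i for i in (0, 1, 2, 10)]
    print(sorted(names))
    # ['model.0.dictionary', 'model.1.dictionary', 'model.10.dictionary', 'model.2.dictionary']
    print(sorted(names, key=lambda n: int(n.split('.')[1])))
    # ['model.0.dictionary', 'model.1.dictionary', 'model.2.dictionary', 'model.10.dictionary']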
From 478aad721b74ea188183fd3123c9d39baa34fdd1 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 16 Aug 2020 22:08:32 -0400
Subject: [PATCH 021/116] test get_dictionary_line, get_dictionary_synonym
---
pilsner/model.py | 1 +
test/ut_model.py | 39 ++++++++++++++++++++++++++++++++++-----
2 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 17e82de..cebebf6 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -119,6 +119,7 @@ def add_normalizer(self, normalizer_name, filename, default=False):
logging.debug('Adding normalizer "%s" from "%s"' % (normalizer_name, filename))
normalizer = self.sic_builder.build_normalizer(filename)
self[self.NORMALIZER_KEY][normalizer_name] = normalizer
+ self.normalizer_map[normalizer_name] = normalizer_name
if len(self[self.NORMALIZER_KEY]) == 1 or default:
self[self.DEFAULT_NORMALIZER_KEY] = normalizer_name
logging.debug('Added normalizer "%s" from "%s"' % (normalizer_name, filename))
diff --git a/test/ut_model.py b/test/ut_model.py
index b007437..112efb8 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -108,19 +108,48 @@ def test_store_attributes(self):
assert entry in stored, 'Entry %s was not stored' % str(entry)
def test_get_dictionary_line(self):
- specs = {'fields': {}, 'id': [0], 'tokenizer': None, 'value': None}
+ specs = {
+ 'fields': {
+ 'normalizer': (0, None, True, False),
+ 'entity_id': (1, None, False, False),
+ 'entity_value': (2, None, False, True),
+ 'something_else': (3, ',', False, False)
+ },
+ 'id': (1, None, False, False),
+ 'tokenizer': (0, None, True, False),
+ 'value': (2, None, False, True)
+ }
entity_ids = {}
line_numbers = {}
line_number = 1
- line = 'entity_id\tstring_value\n'
+ line = 't1\tentity_id\tstring_value\tsome_attr,another_attr\n'
column_separator = '\t'
cell_wall = '\n'
got_line = self.model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall)
- expected = (['entity_id', 'string_value'], 0)
- assert got_line == expected, 'Expected %s, got %s' % (expected, got_line)
+ expected = (['t1', 'entity_id', 'string_value', 'some_attr,another_attr'], 0)
+ assert got_line == expected, 'Expected %s, got %s' % (str(expected), str(got_line))
def test_get_dictionary_synonym(self):
- pass
+ # get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_option=0)
+ columns = ['t1', 'entity_id', 'string_value', 'some_attr,another_attr']
+ specs = {
+ 'fields': {
+ 'normalizer': (0, None, True, False),
+ 'entity_id': (1, None, False, False),
+ 'entity_value': (2, None, False, True),
+ 'something_else': (3, ',', False, False)
+ },
+ 'id': (1, None, False, False),
+ 'tokenizer': (0, None, True, False),
+ 'value': (2, None, False, True)
+ }
+ word_separator = ' '
+ tokenizer_option = 0
+ self.model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
+ self.model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
+ got_synonym = self.model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
+ expected = ('string _ value', 't1')
+ assert got_synonym == expected, 'Expected %s, got %s' % (str(expected), str(got_synonym))
def test_next_trie(self):
pass
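Note: the specs structure used in both tests is worth spelling out: each tuple is (column_index, delimiter, normalizer_flag, value_flag), and the top-level 'id', 'tokenizer', and 'value' entries alias the flagged columns. A small sketch of how those tuples address a raw TSV line (data taken from the test above):

    # Tuple layout: (column_index, delimiter, normalizer_flag, value_flag).
    specs = {
        'id': (1, None, False, False),
        'tokenizer': (0, None, True, False),
        'value': (2, None, False, True)
    }
    line = 't1\tentity_id\tstring_value\tsome_attr,another_attr\n'
    columns = [c.strip('\n') for c in line.split('\t')]
    entity_id = columns[specs['id'][0]]          # 'entity_id'
    normalizer = columns[specs['tokenizer'][0]]  # 't1'
    synonym = columns[specs['value'][0]]         # 'string_value'
    print(entity_id, normalizer, synonym)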
From d9e3cbee657a606e4863666b3080a548cdedf46a Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 16 Aug 2020 22:08:57 -0400
Subject: [PATCH 022/116] clean up
---
test/ut_model.py | 1 -
1 file changed, 1 deletion(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index 112efb8..3852a34 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -130,7 +130,6 @@ def test_get_dictionary_line(self):
assert got_line == expected, 'Expected %s, got %s' % (str(expected), str(got_line))
def test_get_dictionary_synonym(self):
- # get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_option=0)
columns = ['t1', 'entity_id', 'string_value', 'some_attr,another_attr']
specs = {
'fields': {
From 600eba0b7c97c3893b452cef1abeeca286c755dd Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 16 Aug 2020 22:14:44 -0400
Subject: [PATCH 023/116] test_next_trie
---
test/ut_model.py | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/test/ut_model.py b/test/ut_model.py
index 3852a34..7c82e68 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -151,7 +151,21 @@ def test_get_dictionary_synonym(self):
assert got_synonym == expected, 'Expected %s, got %s' % (str(expected), str(got_synonym))
def test_next_trie(self):
- pass
+ specs = {'specs': ('are', 'here')}
+ compressed = True
+ tokenizer_option = 0
+ word_separator = ' '
+ self.model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
+ self.model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
+ got_trie = self.model.next_trie(specs, compressed, tokenizer_option, word_separator)
+ expected = {
+ self.model.CONTENT_KEY: {'t1': {}, 't2': {}},
+ self.model.SPECS_KEY: specs,
+ self.model.COMPRESSED_KEY: int(compressed),
+ self.model.TOKENIZER_OPTION_KEY: tokenizer_option,
+ self.model.WORD_SEPARATOR_KEY: word_separator
+ }
+ assert got_trie == expected, 'Expected %s, got %s' % (str(expected), str(got_trie))
if __name__ == '__main__':
unittest.main()
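Note: the expected value in test_next_trie doubles as documentation for the trie envelope: one empty content bucket per registered normalizer plus four bookkeeping keys. A minimal free-standing sketch of that construction (key constants copied from the Model class; the bypass-normalizer fallback added in the next patch is omitted):

    CONTENT_KEY, SPECS_KEY = '~content', '~specs'
    COMPRESSED_KEY = '~compressed'
    TOKENIZER_OPTION_KEY, WORD_SEPARATOR_KEY = '~tokenizer_option', '~word_separator'

    def next_trie(normalizers, specs, compressed, tokenizer_option, word_separator):
        # One empty content bucket per normalizer, plus bookkeeping keys.
        return {
            CONTENT_KEY: {name: {} for name in normalizers},
            SPECS_KEY: specs,
            COMPRESSED_KEY: int(compressed),
            TOKENIZER_OPTION_KEY: tokenizer_option,
            WORD_SEPARATOR_KEY: word_separator
        }

    print(next_trie(['t1', 't2'], {'specs': ('are', 'here')}, True, 0, ' '))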
From ae3444624725731a750210dde0dd8090ba234ed7 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 18 Aug 2020 22:06:57 -0400
Subject: [PATCH 024/116] bypassing normalizers
---
pilsner/model.py | 3 +++
pilsner/normalizer.bypass.xml | 4 ++++
2 files changed, 7 insertions(+)
create mode 100644 pilsner/normalizer.bypass.xml
diff --git a/pilsner/model.py b/pilsner/model.py
index cebebf6..b41bcbd 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -191,6 +191,7 @@ def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line
if entity_id not in entity_ids:
entity_ids[entity_id] = len(entity_ids)
internal_id = entity_ids[entity_id]
+ line_numbers[line_number] = internal_id
return columns, internal_id
def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_option=0):
@@ -206,6 +207,8 @@ def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_optio
return synonym, normalizer_name
def next_trie(self, specs, compressed, tokenizer_option, word_separator):
+ if len(self[self.NORMALIZER_KEY]) == 0:
+ self.add_normalizer('bypass', '%s/normalizer.bypass.xml' % (os.path.abspath(os.path.dirname(__file__))))
new_trie = {
self.CONTENT_KEY: {normalizer_name: {} for normalizer_name in self[self.NORMALIZER_KEY]},
self.SPECS_KEY: specs,
diff --git a/pilsner/normalizer.bypass.xml b/pilsner/normalizer.bypass.xml
new file mode 100644
index 0000000..eab2309
--- /dev/null
+++ b/pilsner/normalizer.bypass.xml
@@ -0,0 +1,4 @@
+
+
+
+
From 1308e0bf5f1627df88cc2b6f770788421b694f24 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 18 Aug 2020 22:07:23 -0400
Subject: [PATCH 025/116] updated sandbox
---
test/assets/sample_dictionary.txt | 10 +++++-----
test/assets/tokenizer1.xml | 2 +-
test/assets/tokenizer2.xml | 2 +-
test/sandbox.py | 9 ++++++---
4 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/test/assets/sample_dictionary.txt b/test/assets/sample_dictionary.txt
index ad90ce7..815e777 100644
--- a/test/assets/sample_dictionary.txt
+++ b/test/assets/sample_dictionary.txt
@@ -1,7 +1,7 @@
-tokenizer1 entity2 acinic cell carcinomas C,D,E
-tokenizer1 entity2 acinic cell carcinomax D,E
-tokenizer2 entity1 acinic cell carcinomas A,B,C
-tokenizer1 entity1 acinic cell carcinoma A,B,C
-tokenizer2 entity1 afinic cell carcinoma A,B,C
+tokenizer1 entity2 awesome white refrigerators C,D,E
+tokenizer1 entity2 awesome white refrigeratorx D,E
+tokenizer2 entity1 awesome white refrigerators A,B,C
+tokenizer1 entity1 awesome white refrigerator A,B,C
+tokenizer2 entity1 awwsome white refrigerator A,B,C
tokenizer2 entity1 it A,B,C
tokenizer2 entity1 o A,B,C
diff --git a/test/assets/tokenizer1.xml b/test/assets/tokenizer1.xml
index 759fe98..9bb0bec 100644
--- a/test/assets/tokenizer1.xml
+++ b/test/assets/tokenizer1.xml
@@ -1,5 +1,5 @@
-
+
diff --git a/test/assets/tokenizer2.xml b/test/assets/tokenizer2.xml
index 16d6b32..b784769 100644
--- a/test/assets/tokenizer2.xml
+++ b/test/assets/tokenizer2.xml
@@ -1,5 +1,5 @@
-
+
diff --git a/test/sandbox.py b/test/sandbox.py
index e8499f1..6025faa 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -24,11 +24,13 @@ def save_it():
{'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
{'name': 'MSID', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
{'name': 'Value', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
- {'name': 'smth', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'smth', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = r.compile_dict_specs(fields)
+ _messages.clear()
r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
- s = 'this is afinic cell carcinoma o carcinoma, damn it'
+ s = 'this is awwsome white refrigerator o refrigerator, is it not'
+ _messages.clear()
q = r.parse(m, s)
print(q)
m.save('.test_model')
@@ -36,8 +38,9 @@ def save_it():
def load_it():
rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_mesage)
m = pilsner.Model('.test_model')
- s = 'this is acinic cell carcinomas o carcinoma, damn it'
+ s = 'this is awesome white refrigerators o carcinoma, is it not'
s *= 10
+ _messages.clear()
q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
print(q)
From c4b80d0be8aa5ae7324c345faeb3aafde0296d1a Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 18 Aug 2020 22:39:16 -0400
Subject: [PATCH 026/116] utility unit tests (build recognizer)
---
test/ut_utility.py | 150 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 143 insertions(+), 7 deletions(-)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 4220560..09adca1 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -5,10 +5,10 @@
class TestUtility(unittest.TestCase):
def setUp(self):
- pass
+ self.recognizer = pilsner.Recognizer()
def tearDown(self):
- pass
+ del(self.recognizer)
def test_init(self):
r = pilsner.Recognizer()
@@ -21,19 +21,155 @@ def test_del(self):
assert 'r' not in locals(), 'Instance of Recognizer class has not been destroyed'
def test_push_message(self):
- pass
+ messages = []
+ def callback_function(message):
+ messages.append(message)
+ self.recognizer.push_message('message 1', callback_function)
+ self.recognizer.push_message('message 2', callback_function)
+ expected = ['message 1', 'message 2']
+ assert messages == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(messages))
def test_compile_dict_specs(self):
- pass
+ fields = [
+ {'name': 'column 1', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'column 2', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'column 3', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'column 4', 'include': False, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'column 5', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+ ]
+ specs = self.recognizer.compile_dict_specs(fields)
+ expected = {
+ 'fields': {
+ 'column 1': (0, None, True, False),
+ 'column 2': (1, None, False, False),
+ 'column 3': (2, None, False, True),
+ 'column 5': (4, ',', False, False)
+ },
+ 'id': (1, None, False, False),
+ 'tokenizer': (0, None, True, False),
+ 'value': (2, None, False, True)
+ }
+ assert specs == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(specs))
def test_make_recognizer(self):
- pass
+ fields = [
+ {'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+ ]
+ specs = self.recognizer.compile_dict_specs(fields)
+ model = pilsner.Model()
+ got_recognizer, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', cell_wall='\n', tokenizer_option=0)
+ expected_recognizer = [
+ {
+ '~specs': {
+ 'fields': {
+ 'normalizer': (0, None, True, False),
+ 'entity_id': (1, None, False, False),
+ 'label': (2, None, False, True),
+ 'some_attribute': (3, ',', False, False)
+ },
+ 'id': (1, None, False, False),
+ 'tokenizer': (0, None, True, False),
+ 'value': (2, None, False, True)
+ },
+ '~compressed': 1,
+ '~tokenizer_option': 0,
+ '~word_separator': ' ',
+ '~content': {
+ 'bypass': {'a': {'w': {'e': {'some white refrigerator': {'s': {'~i': [0, 2]}, 'x': {'~i': [1]}, '~i': [3]}}, 'w': {'some white refrigerator': {'~i': [4]}}}}, 'i': {'t': {'~i': [5]}}, 'o': {'~i': [6]}}
+ }
+ }
+ ]
+ expected_line_numbers = {
+ 0: 0,
+ 1: 0,
+ 2: 1,
+ 3: 1,
+ 4: 1,
+ 5: 1,
+ 6: 1
+ }
+ assert got_recognizer == expected_recognizer, '\nExpected\n%s\nGot\n%s' % (str(expected_recognizer), str(got_recognizer))
+ assert got_line_numbers == expected_line_numbers, '\nExpected\n%s\nGot\n%s' % (str(expected_line_numbers), str(got_line_numbers))
def test_make_keywords(self):
- pass
+ fields = [
+ {'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+ ]
+ specs = self.recognizer.compile_dict_specs(fields)
+ model = pilsner.Model()
+ _, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', cell_wall='\n', tokenizer_option=0)
+ keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', cell_wall='\n', tokenizer_option=0)
+ expected = {
+ model.CONTENT_KEY: {
+ 0: {'awesome', 'white', 'refrigerators', 'refrigeratorx'},
+ 1: {'awesome', 'refrigerator', 'white', 'o', 'refrigerators', 'it', 'awwsome'}
+ },
+ model.INTERNAL_ID_KEY: {
+ 0: 0,
+ 1: 0,
+ 2: 1,
+ 3: 1,
+ 4: 1,
+ 5: 1,
+ 6: 1
+ }
+ }
+ assert keywords == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(keywords))
def test_compile_model(self):
- pass
+ fields = [
+ {'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+ ]
+ specs = self.recognizer.compile_dict_specs(fields)
+ model = pilsner.Model()
+ model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
+ model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
+ model.normalizer_map = {
+ 'tokenizer1': 't1',
+ 'tokenizer2': 't2'
+ }
+ compiled = self.recognizer.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', cell_wall='\n', include_keywords=True)
+ assert compiled == True, 'pilsner.Recognizer.compile_model() returned False which is not expected'
+ assert model.NORMALIZER_KEY in model, 'Model does not have model.NORMALIZER_KEY which is not expected'
+ assert model.DEFAULT_NORMALIZER_KEY in model, 'Model does not have model.DEFAULT_NORMALIZER_KEY which is not expected'
+ assert model.DICTIONARY_KEY in model, 'Model does not have model.DICTIONARY_KEY which is not expected'
+ assert model.KEYWORDS_KEY in model, 'Model does not have model.KEYWORDS_KEY which is not expected'
+ assert model.DATASOURCE_KEY in model, 'Model does not have model.DATASOURCE_KEY which is not expected'
+ assert model.WORD_SEPARATOR_KEY in model, 'Model does not have model.WORD_SEPARATOR_KEY which is not expected'
+ assert model.TOKENIZER_OPTION_KEY in model, 'Model does not have model.TOKENIZER_OPTION_KEY which is not expected'
+ assert 't1' in model[model.NORMALIZER_KEY], 'Normalizers do not include "t1"'
+ assert 't2' in model[model.NORMALIZER_KEY], 'Normalizers do not include "t2"'
+ expected_dictionary = [
+ {
+ '~specs': {
+ 'fields': {
+ 'normalizer': (0, None, True, False),
+ 'entity_id': (1, None, False, False),
+ 'label': (2, None, False, True),
+ 'some_attribute': (3, ',', False, False)
+ },
+ 'id': (1, None, False, False),
+ 'tokenizer': (0, None, True, False),
+ 'value': (2, None, False, True)
+ },
+ '~compressed': 1,
+ '~tokenizer_option': 0,
+ '~word_separator': ' ',
+ '~content': {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {'~i': [0]}}, 't': {'or': {'x': {'~i': [1]}, '~i': [3]}}}}}, 't2': {'a': {'w': {'e': {'some refrigerators': {'~i': [2]}}, 'w': {'some refrigerator': {'~i': [4]}}}}, 'i': {'t': {'~i': [5]}}, 'o': {'~i': [6]}}}
+ }
+ ]
+ expected_keywords = {'~content': {}, '~iid': {0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}}
+ assert model[model.DICTIONARY_KEY] == expected_dictionary, '\nExpected\n%s\nGot\n%s' % (str(expected_dictionary), str(model[model.DICTIONARY_KEY]))
+ assert model[model.KEYWORDS_KEY] == expected_keywords, '\nExpected\n%s\nGot\n%s' % (str(expected_keywords), str(model[model.KEYWORDS_KEY]))
def test_verify_keywords(self):
pass
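Note: test_compile_dict_specs fixes the fields-to-specs contract: excluded columns disappear from 'fields' while the stored indices keep referring to positions in the raw file, and id/tokenizer/value alias whichever columns carry those flags. A sketch that reproduces the expected mapping (not necessarily pilsner's exact implementation):

    def compile_specs(fields):
        # Excluded columns are skipped, but enumerate() keeps raw indices.
        specs = {'fields': {}, 'id': None, 'tokenizer': None, 'value': None}
        for i, field in enumerate(fields):
            if not field['include']:
                continue
            spec = (i, field['delimiter'], field['normalizer_flag'], field['value_flag'])
            specs['fields'][field['name']] = spec
            if field['id_flag']:
                specs['id'] = spec
            if field['normalizer_flag']:
                specs['tokenizer'] = spec
            if field['value_flag']:
                specs['value'] = spec
        return specs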
From 9eb10deb4d1806c83086616cb3d92e3f79862191 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 19 Aug 2020 22:15:32 -0400
Subject: [PATCH 027/116] unit tests (generalize keys)
---
test/assets/sample_dictionary.txt | 2 ++
test/ut_utility.py | 41 +++++++++++++++++--------------
2 files changed, 25 insertions(+), 18 deletions(-)
diff --git a/test/assets/sample_dictionary.txt b/test/assets/sample_dictionary.txt
index 815e777..62e9db3 100644
--- a/test/assets/sample_dictionary.txt
+++ b/test/assets/sample_dictionary.txt
@@ -1,7 +1,9 @@
tokenizer1 entity2 awesome white refrigerators C,D,E
tokenizer1 entity2 awesome white refrigeratorx D,E
+tokenizer2 entity2 conflicting refrigerator D,E
tokenizer2 entity1 awesome white refrigerators A,B,C
tokenizer1 entity1 awesome white refrigerator A,B,C
tokenizer2 entity1 awwsome white refrigerator A,B,C
tokenizer2 entity1 it A,B,C
tokenizer2 entity1 o A,B,C
+tokenizer2 entity1 conflicting refrigerator A,B,C
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 09adca1..7afc98d 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -63,7 +63,7 @@ def test_make_recognizer(self):
got_recognizer, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', cell_wall='\n', tokenizer_option=0)
expected_recognizer = [
{
- '~specs': {
+ model.SPECS_KEY: {
'fields': {
'normalizer': (0, None, True, False),
'entity_id': (1, None, False, False),
@@ -74,22 +74,24 @@ def test_make_recognizer(self):
'tokenizer': (0, None, True, False),
'value': (2, None, False, True)
},
- '~compressed': 1,
- '~tokenizer_option': 0,
- '~word_separator': ' ',
- '~content': {
- 'bypass': {'a': {'w': {'e': {'some white refrigerator': {'s': {'~i': [0, 2]}, 'x': {'~i': [1]}, '~i': [3]}}, 'w': {'some white refrigerator': {'~i': [4]}}}}, 'i': {'t': {'~i': [5]}}, 'o': {'~i': [6]}}
+ model.COMPRESSED_KEY: 1,
+ model.TOKENIZER_OPTION_KEY: 0,
+ model.WORD_SEPARATOR_KEY: ' ',
+ model.CONTENT_KEY: {
+ 'bypass': {'a': {'w': {'e': {'some white refrigerator': {'s': {model.ENTITY_KEY: [0, 3]}, 'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}, 'w': {'some white refrigerator': {model.ENTITY_KEY: [5]}}}}, 'c': {'onflicting refrigirator': {model.ENTITY_KEY: [2, 8]}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}
}
}
]
expected_line_numbers = {
0: 0,
1: 0,
- 2: 1,
+ 2: 0,
3: 1,
4: 1,
5: 1,
- 6: 1
+ 6: 1,
+ 7: 1,
+ 8: 1
}
assert got_recognizer == expected_recognizer, '\nExpected\n%s\nGot\n%s' % (str(expected_recognizer), str(got_recognizer))
assert got_line_numbers == expected_line_numbers, '\nExpected\n%s\nGot\n%s' % (str(expected_line_numbers), str(got_line_numbers))
@@ -107,17 +109,19 @@ def test_make_keywords(self):
keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', cell_wall='\n', tokenizer_option=0)
expected = {
model.CONTENT_KEY: {
- 0: {'awesome', 'white', 'refrigerators', 'refrigeratorx'},
- 1: {'awesome', 'refrigerator', 'white', 'o', 'refrigerators', 'it', 'awwsome'}
+ 0: {'refrigirator', 'white', 'awesome', 'conflicting', 'refrigerators', 'refrigeratorx'},
+ 1: {'o', 'white', 'conflicting', 'refrigerators', 'awwsome', 'it', 'refrigerator', 'awesome', 'refrigirator'}
},
model.INTERNAL_ID_KEY: {
0: 0,
1: 0,
- 2: 1,
+ 2: 0,
3: 1,
4: 1,
5: 1,
- 6: 1
+ 6: 1,
+ 7: 1,
+ 8: 1
}
}
assert keywords == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(keywords))
@@ -150,7 +154,7 @@ def test_compile_model(self):
assert 't2' in model[model.NORMALIZER_KEY], 'Normalizers do not include "t2"'
expected_dictionary = [
{
- '~specs': {
+ model.SPECS_KEY: {
'fields': {
'normalizer': (0, None, True, False),
'entity_id': (1, None, False, False),
@@ -161,13 +165,14 @@ def test_compile_model(self):
'tokenizer': (0, None, True, False),
'value': (2, None, False, True)
},
- '~compressed': 1,
- '~tokenizer_option': 0,
- '~word_separator': ' ',
- '~content': {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {'~i': [0]}}, 't': {'or': {'x': {'~i': [1]}, '~i': [3]}}}}}, 't2': {'a': {'w': {'e': {'some refrigerators': {'~i': [2]}}, 'w': {'some refrigerator': {'~i': [4]}}}}, 'i': {'t': {'~i': [5]}}, 'o': {'~i': [6]}}}
+ model.COMPRESSED_KEY: 1,
+ model.TOKENIZER_OPTION_KEY: 0,
+ model.WORD_SEPARATOR_KEY: ' ',
+ model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}}, 't2': {'c': {'onflicting refrigirator': {model.ENTITY_KEY: [2, 8]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
}
]
- expected_keywords = {'~content': {}, '~iid': {0: 0, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1}}
+
+ expected_keywords = {model.CONTENT_KEY: {0: {'refrigera', 'refrigeratorx', 'tors', 'refrigirator', 'white', 'awesome', 'conflicting'}, 1: {'it', 'o', 'awwsome', 'white', 'refrigerator', 'refrigirator', 'conflicting', 'refrigerators', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}
assert model[model.DICTIONARY_KEY] == expected_dictionary, '\nExpected\n%s\nGot\n%s' % (str(expected_dictionary), str(model[model.DICTIONARY_KEY]))
assert model[model.KEYWORDS_KEY] == expected_keywords, '\nExpected\n%s\nGot\n%s' % (str(expected_keywords), str(model[model.KEYWORDS_KEY]))
From 17278fe4e57c26d18d7ab4b6675545fdca3e92f5 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 19 Aug 2020 22:15:48 -0400
Subject: [PATCH 028/116] keywords issue spotted
---
pilsner/utility.py | 33 +++++++++++++++++----------------
test/sandbox.py | 9 +++++----
2 files changed, 22 insertions(+), 20 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index f80a710..e308760 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -320,10 +320,10 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
ret.append(shorter_alternative)
elif shorter_alternative:
ret.append(shorter_alternative)
- if model[model.KEYWORDS_KEY] is not None:
- self.verify_keywords(model, ret, source_string, word_separator)
rets += ret
current_trie_index += 1
+ if model[model.KEYWORDS_KEY] is not None:
+ self.verify_keywords(model, rets, source_string, word_separator)
self.push_message(progress_to, self.callback_progress)
self.logger('Done.')
return rets
@@ -341,20 +341,21 @@ def flatten_spans(self, layers):
for _attr_name in _attrs:
for _attr_value in _attrs[_attr_name]:
all_entries.append(tuple([_left, _right, _attr_name, _attr_value]))
- all_entries = sorted(sorted(all_entries, key=lambda x: -x[1]), key=lambda x: x[0])
- filtered_entries = [all_entries[0]]
- for i in range(1, len(all_entries)):
- q = all_entries[i]
- if (filtered_entries[-1][0] <= q[0] < filtered_entries[-1][1] and filtered_entries[-1][0] < q[1] < filtered_entries[-1][1]) or (filtered_entries[-1][0] < q[0] < filtered_entries[-1][1] and filtered_entries[-1][0] < q[1] <= filtered_entries[-1][1]):
- continue
- filtered_entries.append(q)
- for entry in filtered_entries:
- _location, _attr_name, _attr_value = tuple([int(entry[0]), int(entry[1])]), str(entry[2]), str(entry[3])
- if _location not in ret:
- ret[_location] = {}
- if _attr_name not in ret[_location]:
- ret[_location][_attr_name] = set()
- ret[_location][_attr_name].add(_attr_value)
+ if len(all_entries) > 0:
+ all_entries = sorted(sorted(all_entries, key=lambda x: -x[1]), key=lambda x: x[0])
+ filtered_entries = [all_entries[0]]
+ for i in range(1, len(all_entries)):
+ q = all_entries[i]
+ if (filtered_entries[-1][0] <= q[0] < filtered_entries[-1][1] and filtered_entries[-1][0] < q[1] < filtered_entries[-1][1]) or (filtered_entries[-1][0] < q[0] < filtered_entries[-1][1] and filtered_entries[-1][0] < q[1] <= filtered_entries[-1][1]):
+ continue
+ filtered_entries.append(q)
+ for entry in filtered_entries:
+ _location, _attr_name, _attr_value = tuple([int(entry[0]), int(entry[1])]), str(entry[2]), str(entry[3])
+ if _location not in ret:
+ ret[_location] = {}
+ if _attr_name not in ret[_location]:
+ ret[_location][_attr_name] = set()
+ ret[_location][_attr_name].add(_attr_value)
return ret
def reduce_spans(self, segments):
diff --git a/test/sandbox.py b/test/sandbox.py
index 6025faa..54229bc 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -29,23 +29,24 @@ def save_it():
specs = r.compile_dict_specs(fields)
_messages.clear()
r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
- s = 'this is awwsome white refrigerator o refrigerator, is it not'
+ print(m['~keywords'])
+ s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
_messages.clear()
q = r.parse(m, s)
print(q)
- m.save('.test_model')
+ #m.save('.test_model')
def load_it():
rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_mesage)
m = pilsner.Model('.test_model')
- s = 'this is awesome white refrigerators o carcinoma, is it not'
+ s = 'this is awesome white refrigerators o refrigerator, is it not'
s *= 10
_messages.clear()
q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
print(q)
save_it()
-load_it()
+#load_it()
#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
#r = Recognizer()
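Note: two things change in flatten_spans here: the whole filtering block is now guarded by `len(all_entries) > 0` (indexing `all_entries[0]` used to raise on an empty list), and moving verify_keywords out of the per-trie loop lets it see all spans at once. The nested-span filter itself is easier to see in isolation; a sketch on bare (left, right) pairs, using the segments from the commented-out sandbox line above:

    def filter_nested(spans):
        # Sort by (start asc, end desc), then drop any span strictly nested
        # inside the last kept one, mirroring the condition in flatten_spans.
        if not spans:
            return []
        spans = sorted(sorted(spans, key=lambda x: -x[1]), key=lambda x: x[0])
        kept = [spans[0]]
        for left, right in spans[1:]:
            last_l, last_r = kept[-1][0], kept[-1][1]
            inside = (last_l <= left < last_r and last_l < right < last_r) or \
                     (last_l < left < last_r and last_l < right <= last_r)
            if not inside:
                kept.append((left, right))
        return kept

    print(filter_nested([(1, 2), (3, 8), (1, 6), (2, 3)]))  # [(1, 6), (3, 8)]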
From 1cd496b122357ea0fdafaa8b13501840f8486e69 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 19 Aug 2020 22:21:44 -0400
Subject: [PATCH 029/116] unit tests
---
test/ut_utility.py | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 7afc98d..c316b08 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -78,7 +78,7 @@ def test_make_recognizer(self):
model.TOKENIZER_OPTION_KEY: 0,
model.WORD_SEPARATOR_KEY: ' ',
model.CONTENT_KEY: {
- 'bypass': {'a': {'w': {'e': {'some white refrigerator': {'s': {model.ENTITY_KEY: [0, 3]}, 'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}, 'w': {'some white refrigerator': {model.ENTITY_KEY: [5]}}}}, 'c': {'onflicting refrigirator': {model.ENTITY_KEY: [2, 8]}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}
+ 'bypass': {'a': {'w': {'e': {'some white refrigerator': {'s': {model.ENTITY_KEY: [0, 3]}, 'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}, 'w': {'some white refrigerator': {model.ENTITY_KEY: [5]}}}}, 'c': {'onflicting refrigerator': {model.ENTITY_KEY: [2, 8]}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}
}
}
]
@@ -109,8 +109,8 @@ def test_make_keywords(self):
keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', cell_wall='\n', tokenizer_option=0)
expected = {
model.CONTENT_KEY: {
- 0: {'refrigirator', 'white', 'awesome', 'conflicting', 'refrigerators', 'refrigeratorx'},
- 1: {'o', 'white', 'conflicting', 'refrigerators', 'awwsome', 'it', 'refrigerator', 'awesome', 'refrigirator'}
+ 0: {'refrigerator', 'white', 'awesome', 'conflicting', 'refrigerators', 'refrigeratorx'},
+ 1: {'o', 'white', 'conflicting', 'refrigerators', 'awwsome', 'it', 'refrigerator', 'awesome', 'refrigerator'}
},
model.INTERNAL_ID_KEY: {
0: 0,
@@ -168,11 +168,10 @@ def test_compile_model(self):
model.COMPRESSED_KEY: 1,
model.TOKENIZER_OPTION_KEY: 0,
model.WORD_SEPARATOR_KEY: ' ',
- model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}}, 't2': {'c': {'onflicting refrigirator': {model.ENTITY_KEY: [2, 8]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
+ model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}}, 't2': {'c': {'onflicting refrigerator': {model.ENTITY_KEY: [2, 8]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
}
]
-
- expected_keywords = {model.CONTENT_KEY: {0: {'refrigera', 'refrigeratorx', 'tors', 'refrigirator', 'white', 'awesome', 'conflicting'}, 1: {'it', 'o', 'awwsome', 'white', 'refrigerator', 'refrigirator', 'conflicting', 'refrigerators', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}
+ expected_keywords = {model.CONTENT_KEY: {0: {'refrigera', 'refrigeratorx', 'tors', 'refrigerator', 'white', 'awesome', 'conflicting'}, 1: {'it', 'o', 'awwsome', 'white', 'refrigerator', 'refrigerator', 'conflicting', 'refrigerators', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}
assert model[model.DICTIONARY_KEY] == expected_dictionary, '\nExpected\n%s\nGot\n%s' % (str(expected_dictionary), str(model[model.DICTIONARY_KEY]))
assert model[model.KEYWORDS_KEY] == expected_keywords, '\nExpected\n%s\nGot\n%s' % (str(expected_keywords), str(model[model.KEYWORDS_KEY]))
From b8c41124333038574a2e4faa6992f9ad74a3d498 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Wed, 19 Aug 2020 22:46:40 -0400
Subject: [PATCH 030/116] unit tests (verify_keywords(), unpack_trie())
---
test/ut_utility.py | 68 ++++++++++++++++++++++++++++++++++------------
1 file changed, 51 insertions(+), 17 deletions(-)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index c316b08..2d589a4 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -10,6 +10,24 @@ def setUp(self):
def tearDown(self):
del(self.recognizer)
+ def compile_test_model(self):
+ fields = [
+ {'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+ ]
+ specs = self.recognizer.compile_dict_specs(fields)
+ model = pilsner.Model()
+ model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
+ model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
+ model.normalizer_map = {
+ 'tokenizer1': 't1',
+ 'tokenizer2': 't2'
+ }
+ compiled = self.recognizer.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', cell_wall='\n', include_keywords=True)
+ return compiled, model
+
def test_init(self):
r = pilsner.Recognizer()
assert 'r' in locals(), 'Instance of Recognizer class has not been created'
@@ -127,21 +145,7 @@ def test_make_keywords(self):
assert keywords == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(keywords))
def test_compile_model(self):
- fields = [
- {'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
- {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
- {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
- {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
- ]
- specs = self.recognizer.compile_dict_specs(fields)
- model = pilsner.Model()
- model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
- model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
- model.normalizer_map = {
- 'tokenizer1': 't1',
- 'tokenizer2': 't2'
- }
- compiled = self.recognizer.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', cell_wall='\n', include_keywords=True)
+ compiled, model = self.compile_test_model()
assert compiled == True, 'pilsner.Recognizer.compile_model() returned False which is not expected'
assert model.NORMALIZER_KEY in model, 'Model does not have model.NORMALIZER_KEY which is not expected'
assert model.DEFAULT_NORMALIZER_KEY in model, 'Model does not have model.DEFAULT_NORMALIZER_KEY which is not expected'
@@ -176,10 +180,40 @@ def test_compile_model(self):
assert model[model.KEYWORDS_KEY] == expected_keywords, '\nExpected\n%s\nGot\n%s' % (str(expected_keywords), str(model[model.KEYWORDS_KEY]))
def test_verify_keywords(self):
- pass
+ _, model = self.compile_test_model()
+ src = 'awwsome conflicting refrigerator'
+ recognized = [
+ (
+ [2, 8],
+ {
+ 2: {'entity_id': ['entity2'], 'normalizer': ['tokenizer2'], 'some_attribute': ['D', 'E']},
+ 8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}
+ },
+ 'conflicting refrigerator',
+ 8,
+ 31
+ )
+ ]
+ self.recognizer.verify_keywords(model=model, recognized=recognized, src=src, word_separator=' ')
+ expected = [
+ (
+ [8],
+ {
+ 8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}
+ },
+ 'conflicting refrigerator',
+ 8,
+ 31
+ )
+ ]
+ assert recognized == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(recognized))
def test_unpack_trie(self):
- pass
+ _, model = self.compile_test_model()
+ packed_trie = {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}
+ expected = {'w': {'e': {'s': {'o': {'m': {'e': {' ': {'w': {'h': {'i': {'t': {'e': {' ': {'r': {'e': {'f': {'r': {'i': {'g': {'e': {'r': {'a': {' ': {'tors': {'~i': [0]}}, 't': {'or': {'x': {'~i': [1]}, '~i': [4]}}}}}}}}}}}}}}}}}}}}}}}}}
+ unpacked_trie = self.recognizer.unpack_trie(model=model, packed_trie=packed_trie, compressed=True)
+ assert unpacked_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(unpacked_trie))
def test_check_attrs(self):
pass
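Note: the expectation in test_unpack_trie reveals that unpacking is shallow: only the top-level multi-character edges expand into single-character chains, while deeper packed edges ('tors', 'or') stay packed until traversal reaches them. A one-level sketch of that expansion (pilsner's unpack_trie additionally takes the model and a compressed flag):

    ENTITY_KEY = '~i'

    def unpack_one_level(packed):
        # Expand each multi-character edge into a chain of single-character
        # nodes; children keep whatever packed edges they still carry.
        unpacked = {}
        for key, child in packed.items():
            if key == ENTITY_KEY or len(key) == 1:
                unpacked[key] = child
                continue
            node = unpacked.setdefault(key[0], {})
            for ch in key[1:-1]:
                node = node.setdefault(ch, {})
            node[key[-1]] = child
        return unpacked

    packed = {'ado': {'pt': {ENTITY_KEY: [1]}, 're': {ENTITY_KEY: [2]}}}
    print(unpack_one_level(packed))
    # {'a': {'d': {'o': {'pt': {'~i': [1]}, 're': {'~i': [2]}}}}}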
From 210a67d5dca4ee7c6d1b51856f6c10bceb570142 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 20 Aug 2020 15:27:12 -0400
Subject: [PATCH 031/116] flag indicating if a span was verified by kwds
---
pilsner/utility.py | 13 ++++++++-----
test/ut_utility.py | 6 ++++--
2 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index e308760..ff55238 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -178,7 +178,7 @@ def verify_keywords(self, model, recognized, src, word_separator):
winner_id.clear()
if kwd_score[i] == winner_score:
winner_id.add(i)
- recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:])
+ recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:5] + [True])
def unpack_trie(self, model, packed_trie, compressed):
"""TODO: add docstring here
@@ -232,6 +232,8 @@ def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, proces
def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query='', progress_from=0, progress_to=100):
# TODO: review for refactoring
+ def blend_in(_ret, _rets):
+ _rets += _ret
self.logger('Analyzing "%s"... ' % (source_string))
rets = []
this_progress_position = 0
@@ -273,7 +275,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
- shorter_alternative = (identified[0], identified[1], string_so_far, start_index + 1, end_index)
+ shorter_alternative = (identified[0], identified[1], string_so_far, start_index + 1, end_index, False)
if character in subtrie:
if character == word_separator and temporary_index == -1:
temporary_index = current_index
@@ -286,7 +288,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
- ret.append((identified[0], identified[1], string_so_far, start_index + 1, end_index))
+ ret.append((identified[0], identified[1], string_so_far, start_index + 1, end_index, False))
shorter_alternative = None
else:
if shorter_alternative:
@@ -315,12 +317,13 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
- ret.append((identified[0], identified[1], string_so_far, start_index + 1, current_index - 1))
+ ret.append((identified[0], identified[1], string_so_far, start_index + 1, current_index - 1, False))
elif shorter_alternative:
ret.append(shorter_alternative)
elif shorter_alternative:
ret.append(shorter_alternative)
- rets += ret
+ #rets += ret
+ blend_in(ret, rets)
current_trie_index += 1
if model[model.KEYWORDS_KEY] is not None:
self.verify_keywords(model, rets, source_string, word_separator)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 2d589a4..353749d 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -191,7 +191,8 @@ def test_verify_keywords(self):
},
'conflicting refrigerator',
8,
- 31
+ 31,
+ False
)
]
self.recognizer.verify_keywords(model=model, recognized=recognized, src=src, word_separator=' ')
@@ -203,7 +204,8 @@ def test_verify_keywords(self):
},
'conflicting refrigerator',
8,
- 31
+ 31,
+ True
)
]
assert recognized == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(recognized))
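Note: the one-liner that rebuilds recognized[k] above is dense, so here is an equivalent, easier-to-read sketch. The span tuple's field order (ids, per-id attributes, matched string, start, end, verified flag) is inferred from the test data, and internal_id_of stands for model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]:

def keep_winners(span, winner_ids, internal_id_of):
    # drop candidate ids whose keyword group lost, then tag the span as verified
    ids, attrs = span[0], span[1]
    kept = [x for x in ids if internal_id_of[x] in winner_ids]
    kept_attrs = {x: attrs[x] for x in kept}
    return tuple([kept, kept_attrs] + list(span)[2:5] + [True])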
From ce4f76653842194a840c403f589f60fa6c4b14ee Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Thu, 20 Aug 2020 16:50:21 -0400
Subject: [PATCH 032/116] do not tag disambiguated spans
---
pilsner/utility.py | 8 ++++----
test/sandbox.py | 2 +-
test/ut_utility.py | 6 ++----
3 files changed, 7 insertions(+), 9 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index ff55238..05665bd 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -178,7 +178,7 @@ def verify_keywords(self, model, recognized, src, word_separator):
winner_id.clear()
if kwd_score[i] == winner_score:
winner_id.add(i)
- recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:5] + [True])
+ recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:5])
def unpack_trie(self, model, packed_trie, compressed):
"""TODO: add docstring here
@@ -275,7 +275,7 @@ def blend_in(_ret, _rets):
found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
- shorter_alternative = (identified[0], identified[1], string_so_far, start_index + 1, end_index, False)
+ shorter_alternative = (identified[0], identified[1], string_so_far, start_index + 1, end_index)
if character in subtrie:
if character == word_separator and temporary_index == -1:
temporary_index = current_index
@@ -288,7 +288,7 @@ def blend_in(_ret, _rets):
found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
- ret.append((identified[0], identified[1], string_so_far, start_index + 1, end_index, False))
+ ret.append((identified[0], identified[1], string_so_far, start_index + 1, end_index))
shorter_alternative = None
else:
if shorter_alternative:
@@ -317,7 +317,7 @@ def blend_in(_ret, _rets):
found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
- ret.append((identified[0], identified[1], string_so_far, start_index + 1, current_index - 1, False))
+ ret.append((identified[0], identified[1], string_so_far, start_index + 1, current_index - 1))
elif shorter_alternative:
ret.append(shorter_alternative)
elif shorter_alternative:
diff --git a/test/sandbox.py b/test/sandbox.py
index 54229bc..fd74c36 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -28,7 +28,7 @@ def save_it():
]
specs = r.compile_dict_specs(fields)
_messages.clear()
- r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
+ r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=33, include_keywords=True)
print(m['~keywords'])
s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
_messages.clear()
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 353749d..2d589a4 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -191,8 +191,7 @@ def test_verify_keywords(self):
},
'conflicting refrigerator',
8,
- 31,
- False
+ 31
)
]
self.recognizer.verify_keywords(model=model, recognized=recognized, src=src, word_separator=' ')
@@ -204,8 +203,7 @@ def test_verify_keywords(self):
},
'conflicting refrigerator',
8,
- 31,
- True
+ 31
)
]
assert recognized == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(recognized))
From 5b9134d877e62742503bb2672bda685c2c4a2d27 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Fri, 21 Aug 2020 00:35:33 -0400
Subject: [PATCH 033/116] disambiguation (wip)
---
pilsner/utility.py | 83 ++++++++++++++++++++++++++++---
test/assets/sample_dictionary.txt | 2 +-
test/sandbox.py | 2 +-
3 files changed, 79 insertions(+), 8 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 05665bd..c99363f 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -325,12 +325,82 @@ def blend_in(_ret, _rets):
#rets += ret
blend_in(ret, rets)
current_trie_index += 1
- if model[model.KEYWORDS_KEY] is not None:
- self.verify_keywords(model, rets, source_string, word_separator)
+ #if model[model.KEYWORDS_KEY] is not None:
+ # self.verify_keywords(model, rets, source_string, word_separator)
self.push_message(progress_to, self.callback_progress)
self.logger('Done.')
return rets
+ def disambiguate(self, model, recognized, srcs, word_separator):
+ id_list = [[model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]] for rec in sorted(recognized, key=lambda x: x[2])]
+ for k in range(len(id_list)):
+ ids = id_list[k]
+ if len(ids) < 2:
+ continue
+ si = []
+ src = []
+ ei = []
+ tokens = []
+ s_tokens = []
+ for j in range(len(ids)):
+ #si[j] = 0
+ si.append(0)
+ #src[j] = srcs[recognized[k][4][j]]
+ src.append(srcs[recognized[k][4][j]])
+ #ei[j] = len(src[j])
+ ei.append(len(src[j]))
+ if k > 0:
+ si[j] = recognized[k-1][3]
+ if k < len(id_list) - 1:
+ ei[j] = recognized[k+1][2]
+ #tokens[j] = src[j][si[j]:ei[j]]
+ tokens.append(src[j][si[j]:ei[j]])
+ #s_tokens[j] = set(tokens[j].split(word_separator))
+ s_tokens.append(set(tokens[j].split(word_separator)))
+ tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
+ kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
+ winner_score = 0
+ winner_id = set()
+ kwd_score = {}
+ for i in kwd:
+ kwd_score[i] = len(kwd[i].intersection(s_tokens[i]))
+ if kwd_score[i] > winner_score:
+ winner_score = kwd_score[i]
+ winner_id.clear()
+ if kwd_score[i] == winner_score:
+ winner_id.add(i)
+ recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:])
+
+ def flatten_layers(self, model, layers):
+ ret = []
+ spans = {}
+ srcs = []
+ for i in range(0, len(layers)):
+ layer = layers[i]
+ _map = layer[0]
+ _recognized = layer[1]
+ _src = layer[2]
+ srcs.append(_src)
+ for span in _recognized:
+ location = tuple([_map[span[3]], _map[span[4]]])
+ if location not in spans:
+ spans[location] = []
+ spans[location].append(tuple([span[0], span[1], [i] * len(span[0]), span[3], span[4]]))
+ new_layers = []
+ for location in spans:
+ new_left = location[0]
+ new_right = location[1]
+ new_ids = []
+ new_attrs = {}
+ new_srcids = []
+ for item in spans[location]:
+ new_ids += item[0]
+ new_attrs = {**new_attrs, **item[1]}
+ new_srcids += item[2]
+ new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids]))
+ self.disambiguate(model, new_layers, srcs, ' ')
+ return layers
+
def flatten_spans(self, layers):
ret = {}
all_entries = []
@@ -423,10 +493,11 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
progress_from = current_normalizer_index * spot_progress_share
progress_to = (current_normalizer_index + 1) * spot_progress_share
parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query, progress_from=progress_from, progress_to=progress_to)
- rets.append((character_map, parsed))
+ rets.append((character_map, parsed, normalized_string))
current_normalizer_index += 1
- flattened = self.flatten_spans(rets)
- locations = self.reduce_spans(flattened.keys())
- ret = {location: flattened[location] for location in locations}
+ layers = self.flatten_layers(model, rets)
+ spans = self.flatten_spans(layers)
+ locations = self.reduce_spans(spans.keys())
+ ret = {location: spans[location] for location in locations}
self.logger('Done parsing text.')
return ret
diff --git a/test/assets/sample_dictionary.txt b/test/assets/sample_dictionary.txt
index 62e9db3..ee98b2a 100644
--- a/test/assets/sample_dictionary.txt
+++ b/test/assets/sample_dictionary.txt
@@ -6,4 +6,4 @@ tokenizer1 entity1 awesome white refrigerator A,B,C
tokenizer2 entity1 awwsome white refrigerator A,B,C
tokenizer2 entity1 it A,B,C
tokenizer2 entity1 o A,B,C
-tokenizer2 entity1 conflicting refrigerator A,B,C
+tokenizer1 entity1 conflicting refrigerator A,B,C
diff --git a/test/sandbox.py b/test/sandbox.py
index fd74c36..54229bc 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -28,7 +28,7 @@ def save_it():
]
specs = r.compile_dict_specs(fields)
_messages.clear()
- r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=33, include_keywords=True)
+ r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
print(m['~keywords'])
s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
_messages.clear()
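Note on the scoring inside disambiguate(): each candidate keeps only the keywords its rivals do not share, earns one point per distinctive keyword found among the context tokens around the span, and every candidate tied at the best score survives. A self-contained sketch of that idea; subtracting the union of all rivals, as done here, is a generalization on my part (the patch's dict comprehension subtracts one rival at a time, which coincides with this when there are exactly two candidates):

def score_candidates(keyword_sets, context_tokens):
    # distinctive keywords: what each candidate claims that no rival claims
    distinctive = {
        i: keyword_sets[i] - set().union(*(keyword_sets[j] for j in keyword_sets if j != i))
        for i in keyword_sets
    }
    scores = {i: len(distinctive[i] & context_tokens) for i in distinctive}
    best = max(scores.values())
    return {i for i in scores if scores[i] == best}

# 'conflicting refrigerator' is ambiguous between keyword groups 0 and 1;
# the surrounding text mentions 'awwsome', which only group 1 claims
assert score_candidates(
    {0: {'awesome', 'white'}, 1: {'awwsome', 'it', 'o'}},
    {'awwsome'},
) == {1}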
From e8853587e5044bfa6fe5aac57becb20855bf54f4 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Fri, 21 Aug 2020 02:16:08 -0400
Subject: [PATCH 034/116] disambiguation (works; wip)
---
pilsner/utility.py | 43 +++++++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 20 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index c99363f..c6e1990 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -332,7 +332,8 @@ def blend_in(_ret, _rets):
return rets
def disambiguate(self, model, recognized, srcs, word_separator):
- id_list = [[model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]] for rec in sorted(recognized, key=lambda x: x[2])]
+ _recognized = sorted(recognized, key=lambda x: x[2])
+ id_list = [[model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]] for rec in _recognized]
for k in range(len(id_list)):
ids = id_list[k]
if len(ids) < 2:
@@ -346,13 +347,13 @@ def disambiguate(self, model, recognized, srcs, word_separator):
#si[j] = 0
si.append(0)
#src[j] = srcs[recognized[k][4][j]]
- src.append(srcs[recognized[k][4][j]])
+ src.append(srcs[_recognized[k][4][j]])
#ei[j] = len(src[j])
ei.append(len(src[j]))
if k > 0:
- si[j] = recognized[k-1][3]
+ si[j] = _recognized[k-1][5][0][1]
if k < len(id_list) - 1:
- ei[j] = recognized[k+1][2]
+ ei[j] = _recognized[k+1][5][0][0]
#tokens[j] = src[j][si[j]:ei[j]]
tokens.append(src[j][si[j]:ei[j]])
#s_tokens[j] = set(tokens[j].split(word_separator))
@@ -369,10 +370,10 @@ def disambiguate(self, model, recognized, srcs, word_separator):
winner_id.clear()
if kwd_score[i] == winner_score:
winner_id.add(i)
- recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:])
+ _recognized[k] = tuple([[x for x in _recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: _recognized[k][1][x] for x in _recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(_recognized[k])[2:])
+ return _recognized
def flatten_layers(self, model, layers):
- ret = []
spans = {}
srcs = []
for i in range(0, len(layers)):
@@ -393,27 +394,29 @@ def flatten_layers(self, model, layers):
new_ids = []
new_attrs = {}
new_srcids = []
+ new_locations = []
for item in spans[location]:
new_ids += item[0]
new_attrs = {**new_attrs, **item[1]}
new_srcids += item[2]
- new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids]))
- self.disambiguate(model, new_layers, srcs, ' ')
- return layers
+ new_locations.append(tuple([item[3], item[4]]))
+ new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids, new_locations]))
+ if model[model.KEYWORDS_KEY] is not None:
+ new_layers = self.disambiguate(model, new_layers, srcs, ' ')
+ pass
+ ret = [x[0:4] for x in new_layers]
+ return ret
- def flatten_spans(self, layers):
+ def flatten_spans(self, recognized):
ret = {}
all_entries = []
- for layer in layers:
- _map = layer[0]
- _recognized = layer[1]
- for span in _recognized:
- _ids, _content, _left, _right = span[0], span[1], _map[span[3]], _map[span[4]]
- for _id in _ids:
- _attrs = _content[_id]
- for _attr_name in _attrs:
- for _attr_value in _attrs[_attr_name]:
- all_entries.append(tuple([_left, _right, _attr_name, _attr_value]))
+ for span in recognized:
+ _ids, _content, _left, _right = span[0], span[1], span[2], span[3]
+ for _id in _ids:
+ _attrs = _content[_id]
+ for _attr_name in _attrs:
+ for _attr_value in _attrs[_attr_name]:
+ all_entries.append(tuple([_left, _right, _attr_name, _attr_value]))
if len(all_entries) > 0:
all_entries = sorted(sorted(all_entries, key=lambda x: -x[1]), key=lambda x: x[0])
filtered_entries = [all_entries[0]]
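Note: after this change flatten_spans() no longer needs per-layer character maps; it collapses already-mapped spans into attribute-value sets per location. A hedged restatement of just the collapse step, with the field order assumed from the code above (the overlap filtering the real function performs afterwards is omitted):

from collections import defaultdict

def collapse(spans):
    out = defaultdict(lambda: defaultdict(set))
    for ids, attrs_by_id, left, right in spans:
        for _id in ids:
            for attr_name, values in attrs_by_id[_id].items():
                out[(left, right)][attr_name].update(values)
    return {location: dict(attrs) for location, attrs in out.items()}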
From 0e9a84a43d61ed6c02f15d387daf0f3a0970dc5c Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Fri, 21 Aug 2020 02:16:25 -0400
Subject: [PATCH 035/116] adjusted UT for updated content
---
test/ut_utility.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 2d589a4..481cc61 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -172,7 +172,7 @@ def test_compile_model(self):
model.COMPRESSED_KEY: 1,
model.TOKENIZER_OPTION_KEY: 0,
model.WORD_SEPARATOR_KEY: ' ',
- model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}}, 't2': {'c': {'onflicting refrigerator': {model.ENTITY_KEY: [2, 8]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
+ model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}, 'c': {'onflicting refrigerator': {model.ENTITY_KEY: [8]}}}, 't2': {'c': {'onflicting refrigerator': {model.ENTITY_KEY: [2]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
}
]
expected_keywords = {model.CONTENT_KEY: {0: {'refrigera', 'refrigeratorx', 'tors', 'refrigerator', 'white', 'awesome', 'conflicting'}, 1: {'it', 'o', 'awwsome', 'white', 'refrigerator', 'refrigerator', 'conflicting', 'refrigerators', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}
From 5c8d96a175ae044f3e2fed89de8fbfedb65d25a0 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Fri, 21 Aug 2020 16:26:03 -0400
Subject: [PATCH 036/116] cleanup
---
pilsner/utility.py | 35 +----------------------------------
test/ut_utility.py | 35 ++++++-----------------------------
2 files changed, 7 insertions(+), 63 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index c6e1990..a134ebf 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -152,34 +152,6 @@ def compile_model(self, model, filename, specs, word_separator, column_separator
model[model.KEYWORDS_KEY] = keywords
return True
- def verify_keywords(self, model, recognized, src, word_separator):
- id_list = [set([model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]]) for rec in recognized]
- for k in range(len(id_list)):
- ids = id_list[k]
- if len(ids) < 2:
- continue
- si = 0
- ei = len(src)
- if k > 0:
- si = recognized[k-1][4]
- if k < len(id_list) - 1:
- ei = recognized[k+1][3]
- tokens = src[si:ei]
- s_tokens = set(tokens.split(word_separator))
- tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
- kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
- winner_score = 0
- winner_id = set()
- kwd_score = {}
- for i in kwd:
- kwd_score[i] = len(kwd[i].intersection(s_tokens))
- if kwd_score[i] > winner_score:
- winner_score = kwd_score[i]
- winner_id.clear()
- if kwd_score[i] == winner_score:
- winner_id.add(i)
- recognized[k] = tuple([[x for x in recognized[k][0] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id]] + [{x: recognized[k][1][x] for x in recognized[k][1] if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] in winner_id}] + list(recognized[k])[2:5])
-
def unpack_trie(self, model, packed_trie, compressed):
"""TODO: add docstring here
"""
@@ -232,8 +204,6 @@ def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, proces
def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query='', progress_from=0, progress_to=100):
# TODO: review for refactoring
- def blend_in(_ret, _rets):
- _rets += _ret
self.logger('Analyzing "%s"... ' % (source_string))
rets = []
this_progress_position = 0
@@ -322,11 +292,8 @@ def blend_in(_ret, _rets):
ret.append(shorter_alternative)
elif shorter_alternative:
ret.append(shorter_alternative)
- #rets += ret
- blend_in(ret, rets)
+ rets += ret
current_trie_index += 1
- #if model[model.KEYWORDS_KEY] is not None:
- # self.verify_keywords(model, rets, source_string, word_separator)
self.push_message(progress_to, self.callback_progress)
self.logger('Done.')
return rets
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 481cc61..5f9b85e 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -179,35 +179,6 @@ def test_compile_model(self):
assert model[model.DICTIONARY_KEY] == expected_dictionary, '\nExpected\n%s\nGot\n%s' % (str(expected_dictionary), str(model[model.DICTIONARY_KEY]))
assert model[model.KEYWORDS_KEY] == expected_keywords, '\nExpected\n%s\nGot\n%s' % (str(expected_keywords), str(model[model.KEYWORDS_KEY]))
- def test_verify_keywords(self):
- _, model = self.compile_test_model()
- src = 'awwsome conflicting refrigerator'
- recognized = [
- (
- [2, 8],
- {
- 2: {'entity_id': ['entity2'], 'normalizer': ['tokenizer2'], 'some_attribute': ['D', 'E']},
- 8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}
- },
- 'conflicting refrigerator',
- 8,
- 31
- )
- ]
- self.recognizer.verify_keywords(model=model, recognized=recognized, src=src, word_separator=' ')
- expected = [
- (
- [8],
- {
- 8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}
- },
- 'conflicting refrigerator',
- 8,
- 31
- )
- ]
- assert recognized == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(recognized))
-
def test_unpack_trie(self):
_, model = self.compile_test_model()
packed_trie = {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}
@@ -224,6 +195,12 @@ def test_attribute_unpacker(self):
def test_spot_entities(self):
pass
+ def test_disambiguate(self):
+ pass
+
+ def test_flatten_layers(self):
+ pass
+
def test_flatten_spans(self):
pass
From d559ab7470f9276ee03649db4ed1c07c4f16c684 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 21 Aug 2020 22:25:44 -0400
Subject: [PATCH 037/116] utility adjustments
---
pilsner/utility.py | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index a134ebf..f9822c5 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -171,13 +171,7 @@ def unpack_trie(self, model, packed_trie, compressed):
unpacked_trie_pointer[radix[-1:]] = packed_trie[radix]
return unpacked_trie
- def check_attrs(self, model, trie_leaf, cur, specs, include_query, exclude_query, process_exclude, attrs_out_query):
- trie_leaf[model.ATTRS_KEY] = self.attribute_unpacker(cur, trie_leaf[model.ENTITY_KEY], include_query, exclude_query, process_exclude, attrs_out_query)
- if len(trie_leaf[model.ATTRS_KEY]) == 0:
- return {}
- return trie_leaf
-
- def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query):
+ def unpack_attributes(self, cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query):
attributes = {}
include = set()
exclude = set()
@@ -202,6 +196,12 @@ def attribute_unpacker(self, cur, leaf_ids, include_query, exclude_query, proces
attributes[n][attr_name].append(attr_value)
return attributes
+ def check_attrs(self, model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query):
+ trie_leaf[model.ATTRS_KEY] = self.unpack_attributes(cur, trie_leaf[model.ENTITY_KEY], include_query, exclude_query, process_exclude, attrs_out_query)
+ if len(trie_leaf[model.ATTRS_KEY]) == 0:
+ return {}
+ return trie_leaf
+
def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query='', progress_from=0, progress_to=100):
# TODO: review for refactoring
self.logger('Analyzing "%s"... ' % (source_string))
@@ -224,7 +224,6 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
temporary_index = -1
total_length = len(source_string)
increment_chars = int(total_length / progress_share) if total_length > progress_share else total_length - 1
- dictionary_specs = trie[model.SPECS_KEY]['fields'].keys()
while current_index < total_length:
this_progress_position = int(current_index / increment_chars / total_tries)
if this_progress_position != last_progress_position:
@@ -242,7 +241,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
end_index = current_index
character = source_string[current_index]
if character == word_separator and model.ENTITY_KEY in subtrie:
- found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
+ found_object = self.check_attrs(model, subtrie, model.cursor, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
shorter_alternative = (identified[0], identified[1], string_so_far, start_index + 1, end_index)
@@ -255,7 +254,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
#if everything_or_nothing and current_index == total_length: return []
if character == word_separator or current_index == total_length: # - 1:
if model.ENTITY_KEY in subtrie:
- found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
+ found_object = self.check_attrs(model, subtrie, model.cursor, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
ret.append((identified[0], identified[1], string_so_far, start_index + 1, end_index))
@@ -284,7 +283,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
subtrie = trie[model.CONTENT_KEY][normalizer_name]
current_index += 1
if model.ENTITY_KEY in subtrie:
- found_object = self.check_attrs(model, subtrie, model.cursor, dictionary_specs, include_query, exclude_query, process_exclude, attrs_out_query)
+ found_object = self.check_attrs(model, subtrie, model.cursor, include_query, exclude_query, process_exclude, attrs_out_query)
if found_object:
identified = found_object[model.ENTITY_KEY], found_object[model.ATTRS_KEY]
ret.append((identified[0], identified[1], string_so_far, start_index + 1, current_index - 1))
@@ -374,10 +373,10 @@ def flatten_layers(self, model, layers):
ret = [x[0:4] for x in new_layers]
return ret
- def flatten_spans(self, recognized):
+ def flatten_spans(self, spans):
ret = {}
all_entries = []
- for span in recognized:
+ for span in spans:
_ids, _content, _left, _right = span[0], span[1], span[2], span[3]
for _id in _ids:
_attrs = _content[_id]
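Note on the renamed unpack_attributes(): it turns the numeric leaf ids stored under the trie's entity key into attribute dictionaries by querying the model's sqlite store. A minimal sketch of that lookup, assuming the attrs(n, iid, attr_name, attr_value) table populated earlier in the series and ignoring the include/exclude filtering:

def fetch_attributes(cur, leaf_ids):
    # leaf_ids are the dictionary line numbers stored in the trie leaves
    attributes = {}
    placeholders = ','.join('?' for _ in leaf_ids)
    query = 'select n, attr_name, attr_value from attrs where n in (%s);' % placeholders
    for n, attr_name, attr_value in cur.execute(query, leaf_ids):
        attributes.setdefault(n, {}).setdefault(attr_name, []).append(attr_value)
    return attributes

With the sample dictionary, fetch_attributes(model.cursor, [8]) would be expected to return the same shape as the expected value in test_unpack_attributes below.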
From 92513120bcf0e61c4ce96f2979a15458913b638c Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 21 Aug 2020 22:25:53 -0400
Subject: [PATCH 038/116] unit tests (utility)
---
test/ut_utility.py | 171 ++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 161 insertions(+), 10 deletions(-)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 5f9b85e..6d1493f 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -186,29 +186,180 @@ def test_unpack_trie(self):
unpacked_trie = self.recognizer.unpack_trie(model=model, packed_trie=packed_trie, compressed=True)
assert unpacked_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(unpacked_trie))
- def test_check_attrs(self):
- pass
+ def test_unpack_attributes(self):
+ _, model = self.compile_test_model()
+ cur = model.cursor
+ leaf_ids = [8]
+ include_query = ''
+ exclude_query = ''
+ process_exclude = False
+ attrs_out_query = ''
+ expected = {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer1'], 'some_attribute': ['A', 'B', 'C']}}
+ attributes = self.recognizer.unpack_attributes(cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query)
+ assert attributes == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(attributes))
- def test_attribute_unpacker(self):
- pass
+ def test_check_attrs(self):
+ _, model = self.compile_test_model()
+ trie_leaf = {model.ENTITY_KEY: [8]}
+ cur = model.cursor
+ include_query = ''
+ exclude_query = ''
+ process_exclude = False
+ attrs_out_query = ''
+ expected = {model.ENTITY_KEY: [8], model.ATTRS_KEY: {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer1'], 'some_attribute': ['A', 'B', 'C']}}}
+ got_leaf = self.recognizer.check_attrs(model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query)
+ assert got_leaf == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(got_leaf))
def test_spot_entities(self):
- pass
+ _, model = self.compile_test_model()
+ source_string = 'this is awesome white refrigerator , and this is not'
+ normalizer_name = 't1'
+ expected = [([4], {4: {'entity_id': ['entity1'], 'normalizer': ['tokenizer1'], 'some_attribute': ['A', 'B', 'C']}}, 'awesome white refrigerator', 8, 34)]
+ spotted = self.recognizer.spot_entities(model, source_string, normalizer_name)
+ assert spotted == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(spotted))
def test_disambiguate(self):
- pass
+ _, model = self.compile_test_model()
+ # source string: this is awwsome and conflicting refrigerator, hey
+ # spotted span
+ spotted = [
+ (
+ [8, 2], # internal IDs
+ {
+ 8: {'DType': ['tokenizer1'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}, # attrs for each internal ID
+ 2: {'DType': ['tokenizer2'], 'MSID': ['entity2'], 'smth': ['D', 'E']} # attrs for each internal ID
+ },
+ 20, 44, # location (mapped)
+ [0, 1], # indexes of items in srcs (normalized source strings)
+ [
+ (20, 44), # location[0] (unmapped)
+ (20, 44) # location[1] (unmapped)
+ ]
+ )
+ ]
+ # normalized source strings
+ srcs = [
+ 'this is awwsome and conflicting refrigerator , hey',
+ 'this is awwsome and conflicting refrigerator , hey'
+ ]
+ word_separator = ' '
+ # given a sample model, we expect internal ID == 2 to be removed
+ expected = [
+ (
+ [8],
+ {
+ 8: {'DType': ['tokenizer1'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}
+ },
+ 20, 44,
+ [0, 1],
+ [
+ (20, 44),
+ (20, 44)
+ ]
+ )
+ ]
+ disambiguated = self.recognizer.disambiguate(model, spotted, srcs, word_separator)
+ assert disambiguated == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(disambiguated))
def test_flatten_layers(self):
- pass
+ _, model = self.compile_test_model()
+ # two normalization layers; first has one span; second has two spans
+ layers = [
+ (
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
+ [
+ (
+ [4],
+ {4: {'DType': ['tokenizer1'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 'awesome white refrigerator',
+ 47, 72
+ )
+ ],
+ 'this is awwsome white refrigerator , and it is awesome white refrigerator'
+ ),
+ (
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
+ [
+ (
+ [5],
+ {5: {'DType': ['tokenizer2'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 'awwsome refrigerator',
+ 8, 28
+ ),
+ (
+ [6],
+ {6: {'DType': ['tokenizer2'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 'it',
+ 35, 37
+ )
+ ],
+ 'this is awwsome refrigerator , and it is awesome refrigerator'
+ )
+ ]
+ # we expect all three spans to get stacked in one list (without strings themselves, and with mapped locations)
+ expected = [
+ (
+ [5],
+ {5: {'DType': ['tokenizer2'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 8, 34
+ ),
+ (
+ [6],
+ {6: {'DType': ['tokenizer2'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 40, 42
+ ),
+ (
+ [4],
+ {4: {'DType': ['tokenizer1'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 46, 71
+ )
+ ]
+ flattened = self.recognizer.flatten_layers(model, layers)
+ assert flattened == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(flattened))
def test_flatten_spans(self):
- pass
+ spans = [
+ (
+ [5],
+ {5: {'DType': ['tokenizer2'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 8, 34
+ ),
+ (
+ [6],
+ {6: {'DType': ['tokenizer2'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 40, 42
+ ),
+ (
+ [4],
+ {4: {'DType': ['tokenizer1'], 'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}},
+ 46, 71
+ )
+ ]
+ expected = {
+ (8, 34): {'DType': {'tokenizer2'}, 'MSID': {'entity1'}, 'smth': {'C', 'B', 'A'}},
+ (40, 42): {'DType': {'tokenizer2'}, 'MSID': {'entity1'}, 'smth': {'C', 'B', 'A'}},
+ (46, 71): {'DType': {'tokenizer1'}, 'MSID': {'entity1'}, 'smth': {'C', 'B', 'A'}}
+ }
+ flattened = self.recognizer.flatten_spans(spans)
+ assert flattened == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(flattened))
def test_reduce_spans(self):
- pass
+ segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
+ expected = [tuple([1, 6])]
+ reduced = self.recognizer.reduce_spans(segments)
+ assert reduced == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(reduced))
def test_parse(self):
- pass
+ _, model = self.compile_test_model()
+ source_string = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
+ expected = {
+ (8, 34): {'entity_id': {'entity1'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
+ (35, 36): {'entity_id': {'entity1'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
+ (54, 56): {'entity_id': {'entity1'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
+ (66, 90): {'entity_id': {'entity2'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'D', 'E'}}
+ }
+ output = self.recognizer.parse(model, source_string)
+ assert output == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(output))
if __name__ == '__main__':
unittest.main()
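Note on test_reduce_spans above: the expected output reveals the reduction rule. Spans are ordered by start ascending and end descending, then kept greedily whenever they do not overlap the last kept span, which is why (3, 8) is dropped: it starts inside the already-kept (1, 6). A sketch reproducing the test's expectation (whether a span that starts exactly at the previous end counts as overlapping is an assumption here):

def reduce(segments):
    ordered = sorted(segments, key=lambda s: (s[0], -s[1]))
    kept = []
    for left, right in ordered:
        if not kept or left >= kept[-1][1]:
            kept.append((left, right))
    return kept

assert reduce([(1, 2), (3, 8), (1, 6), (2, 3)]) == [(1, 6)]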
From 7d7cee3090dc6f2771e4de6f9495caefada0e9de Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 21 Aug 2020 22:26:05 -0400
Subject: [PATCH 039/116] chmod
---
scripts/linux/unittest.sh | 0
1 file changed, 0 insertions(+), 0 deletions(-)
mode change 100644 => 100755 scripts/linux/unittest.sh
diff --git a/scripts/linux/unittest.sh b/scripts/linux/unittest.sh
old mode 100644
new mode 100755
From 963721e962b56d540f2998ffda32f51c027600c7 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Sat, 22 Aug 2020 01:54:31 -0400
Subject: [PATCH 040/116] model to use default normalizer
---
pilsner/model.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/pilsner/model.py b/pilsner/model.py
index b41bcbd..c1acd8f 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -202,6 +202,8 @@ def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_optio
normalizer_name = self[self.DEFAULT_NORMALIZER_KEY]
elif columns[specs['tokenizer'][0]] in self.normalizer_map and self.normalizer_map[columns[specs['tokenizer'][0]]] in self[self.NORMALIZER_KEY]:
normalizer_name = self.normalizer_map[columns[specs['tokenizer'][0]]]
+ else:
+ normalizer_name = self[self.DEFAULT_NORMALIZER_KEY]
if normalizer_name is not None:
synonym = self[self.NORMALIZER_KEY][normalizer_name].normalize(synonym, word_separator, tokenizer_option)
return synonym, normalizer_name
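Note: with this patch the normalizer choice becomes a complete fallback chain: the mapped normalizer when the dictionary row names one the model knows, and the default normalizer otherwise. A compact restatement with illustrative names (not the library's API):

def pick_normalizer(column_value, normalizer_map, known_normalizers, default_name):
    mapped = normalizer_map.get(column_value)
    if mapped in known_normalizers:
        return mapped
    return default_name

# an unmapped tokenizer name now falls back to the default instead of leaving the name unset
assert pick_normalizer('tokenizer3', {'tokenizer1': 't1'}, {'t1', 't2'}, 't1') == 't1'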
From 9bed17a1a86426034f0ffbf8845f2758dd8369fe Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Sat, 22 Aug 2020 01:54:54 -0400
Subject: [PATCH 041/116] generate content for performance testing
---
test/performance.py | 77 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)
diff --git a/test/performance.py b/test/performance.py
index e69de29..2f888c8 100644
--- a/test/performance.py
+++ b/test/performance.py
@@ -0,0 +1,77 @@
+import sys; sys.path.insert(0, '')
+import random
+import string
+import timeit
+import pilsner # pylint: disable=E0611,F0401
+
+def random_label(min_words, max_words, min_word_length, max_word_length):
+ number_of_words = random.randint(min_words, max_words)
+ words = []
+ greek = ['qalphaz', 'qbetaz', 'qgammaz']
+ for _ in range(number_of_words):
+ words.append(''.join(random.choice(string.ascii_letters) for i in range(random.randint(min_word_length, max_word_length))))
+ words[0] += greek[random.randint(0, 2)]
+ label = ' '.join(words)
+ return label
+
+def random_label_list(min_labels, max_labels, min_words, max_words, min_word_length, max_word_length):
+ number_of_labels = random.randint(min_labels, max_labels)
+ labels = [random_label(min_words, max_words, min_word_length, max_word_length) for _ in range(number_of_labels)]
+ label_attrs = [','.join([random_label(1, 1, 3, 5) for _ in range(random.randint(1,3))]) for _ in range(number_of_labels)]
+ return labels, label_attrs
+
+def create_labels(dictionary_size, min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length):
+ labels = {}
+ label_attrs = {}
+ for i in range(dictionary_size):
+ labels[i+1], label_attrs[i+1] = random_label_list(min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length)
+ return labels, label_attrs
+
+def create_entity_attrs(dictionary_size):
+ attrs = {}
+ for i in range(dictionary_size):
+ attrs[i+1] = random_label(1, 1, 3, 5)
+ return attrs
+
+def create_test_dictionary(dictionary_size, min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length):
+ labels, label_attrs = create_labels(dictionary_size, min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length)
+ entity_attrs = create_entity_attrs(dictionary_size)
+ with open('.test-dict.txt', mode='w', encoding='utf8') as f:
+ for entity_id in entity_attrs:
+ for i in range(len(labels[entity_id])):
+ f.write('%d\t%s\t%s\t%s\n' % (entity_id, labels[entity_id][i], label_attrs[entity_id][i], entity_attrs[entity_id]))
+ selected_entities = [random.randint(1, dictionary_size) for _ in range(10)]
+ with open('.test-labels.txt', mode='w', encoding='utf8') as f:
+ for entity_id in selected_entities:
+ f.write('%s\n' % (labels[entity_id][random.randint(1, len(labels[entity_id]))-1]))
+
+def create_test_text(number_of_words):
+ labels = []
+ with open('.test-labels.txt', mode='r', encoding='utf8') as f:
+ for line in f:
+ labels.append(line.strip())
+ t = []
+ for _ in range(number_of_words):
+ t.append(random_label(1, 1, 3, 10))
+ for label in labels:
+ t.insert(random.randint(1, len(t))-1, label)
+ text = ' '.join(t)
+ with open('.test-text.txt', mode='w', encoding='utf8') as f:
+ f.write(text)
+
+create_test_dictionary(100000, 1, 5, 1, 4, 3, 10)
+create_test_text(100000)
+
+model = pilsner.Model()
+model.add_normalizer('standard', None)
+recognizer = pilsner.Recognizer()
+fields = [
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'label_attr', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+]
+specs = recognizer.compile_dict_specs(fields)
+recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\t', '\n', include_keywords=True)
+
+#print(model)
From 228e9dcdba02d818d37d3c7beb8e06bea1d90ce7 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Sun, 23 Aug 2020 01:30:33 -0400
Subject: [PATCH 042/116] fixed bug (invalid indexing of token sets)
---
pilsner/utility.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index f9822c5..226e526 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -308,7 +308,7 @@ def disambiguate(self, model, recognized, srcs, word_separator):
src = []
ei = []
tokens = []
- s_tokens = []
+ s_tokens = {}
for j in range(len(ids)):
#si[j] = 0
si.append(0)
@@ -323,7 +323,8 @@ def disambiguate(self, model, recognized, srcs, word_separator):
#tokens[j] = src[j][si[j]:ei[j]]
tokens.append(src[j][si[j]:ei[j]])
#s_tokens[j] = set(tokens[j].split(word_separator))
- s_tokens.append(set(tokens[j].split(word_separator)))
+ #s_tokens.append(set(tokens[j].split(word_separator)))
+ s_tokens[ids[j]] = set(tokens[j].split(word_separator))
tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
winner_score = 0
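Note on the bug fixed here: s_tokens was a list indexed by append order, while the scoring loop looks entries up by candidate id in kwd_score[i] = len(kwd[i].intersection(s_tokens[i])). Whenever the candidate ids are not exactly 0..n-1 in append order, that lookup reads the wrong token set or raises IndexError; keying a dict by candidate id removes the mismatch. A made-up illustration:

candidate_ids = [3, 1]
tokens_by_position = [{'awwsome'}, {'white'}]    # old: tokens_by_position[3] raises IndexError
tokens_by_id = {3: {'awwsome'}, 1: {'white'}}    # new: tokens_by_id[3] is always candidate 3's set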
From 4d8507f739a8ff701fce625f73a7b422d83a6f06 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Sun, 23 Aug 2020 01:49:05 -0400
Subject: [PATCH 043/116] performance testing
---
test/performance.py | 189 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 177 insertions(+), 12 deletions(-)
diff --git a/test/performance.py b/test/performance.py
index 2f888c8..4ed8368 100644
--- a/test/performance.py
+++ b/test/performance.py
@@ -1,10 +1,25 @@
+import os
import sys; sys.path.insert(0, '')
import random
import string
import timeit
-import pilsner # pylint: disable=E0611,F0401
-def random_label(min_words, max_words, min_word_length, max_word_length):
+ENTITIES_IN_DICTIONARY = 50000
+WORDS_IN_TEXT = 100000
+
+MIN_LABELS_PER_ENTITY = 1
+MAX_LABELS_PER_ENTITY = 5
+MIN_WORDS_PER_LABEL = 1
+MAX_WORDS_PER_LABEL = 4
+MIN_WORD_LENGTH = 3
+MAX_WORD_LENGTH = 10
+
+def random_label(
+ min_words,
+ max_words,
+ min_word_length,
+ max_word_length
+):
number_of_words = random.randint(min_words, max_words)
words = []
greek = ['qalphaz', 'qbetaz', 'qgammaz']
@@ -14,27 +29,67 @@ def random_label(min_words, max_words, min_word_length, max_word_length):
label = ' '.join(words)
return label
-def random_label_list(min_labels, max_labels, min_words, max_words, min_word_length, max_word_length):
+def random_label_list(
+ min_labels,
+ max_labels,
+ min_words,
+ max_words,
+ min_word_length,
+ max_word_length
+):
number_of_labels = random.randint(min_labels, max_labels)
labels = [random_label(min_words, max_words, min_word_length, max_word_length) for _ in range(number_of_labels)]
label_attrs = [','.join([random_label(1, 1, 3, 5) for _ in range(random.randint(1,3))]) for _ in range(number_of_labels)]
return labels, label_attrs
-def create_labels(dictionary_size, min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length):
+def create_labels(
+ dictionary_size,
+ min_labels_per_entity,
+ max_labels_per_entity,
+ min_words_per_label,
+ max_words_per_label,
+ min_word_length,
+ max_word_length
+):
labels = {}
label_attrs = {}
for i in range(dictionary_size):
- labels[i+1], label_attrs[i+1] = random_label_list(min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length)
+ labels[i+1], label_attrs[i+1] = random_label_list(
+ min_labels=min_labels_per_entity,
+ max_labels=max_labels_per_entity,
+ min_words=min_words_per_label,
+ max_words=max_words_per_label,
+ min_word_length=min_word_length,
+ max_word_length=max_word_length
+ )
return labels, label_attrs
-def create_entity_attrs(dictionary_size):
+def create_entity_attrs(
+ dictionary_size
+):
attrs = {}
for i in range(dictionary_size):
attrs[i+1] = random_label(1, 1, 3, 5)
return attrs
-def create_test_dictionary(dictionary_size, min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length):
- labels, label_attrs = create_labels(dictionary_size, min_labels_per_entity, max_labels_per_entity, min_words_per_label, max_words_per_label, min_word_length, max_word_length)
+def create_test_dictionary(
+ dictionary_size,
+ min_labels_per_entity,
+ max_labels_per_entity,
+ min_words_per_label,
+ max_words_per_label,
+ min_word_length,
+ max_word_length
+):
+ labels, label_attrs = create_labels(
+ dictionary_size=dictionary_size,
+ min_labels_per_entity=min_labels_per_entity,
+ max_labels_per_entity=max_labels_per_entity,
+ min_words_per_label=min_words_per_label,
+ max_words_per_label=max_words_per_label,
+ min_word_length=min_word_length,
+ max_word_length=max_word_length
+ )
entity_attrs = create_entity_attrs(dictionary_size)
with open('.test-dict.txt', mode='w', encoding='utf8') as f:
for entity_id in entity_attrs:
@@ -59,9 +114,55 @@ def create_test_text(number_of_words):
with open('.test-text.txt', mode='w', encoding='utf8') as f:
f.write(text)
-create_test_dictionary(100000, 1, 5, 1, 4, 3, 10)
-create_test_text(100000)
+def create_test_dataset():
+ create_test_dictionary(
+ dictionary_size=ENTITIES_IN_DICTIONARY,
+ min_labels_per_entity=MIN_LABELS_PER_ENTITY,
+ max_labels_per_entity=MAX_LABELS_PER_ENTITY,
+ min_words_per_label=MIN_WORDS_PER_LABEL,
+ max_words_per_label=MAX_WORDS_PER_LABEL,
+ min_word_length=MIN_WORD_LENGTH,
+ max_word_length=MAX_WORD_LENGTH
+ )
+ create_test_text(
+ number_of_words=WORDS_IN_TEXT
+ )
+def perf_compile_model_save_model(modules_to_test):
+ n = 1
+ for x in modules_to_test:
+ print(
+ '%s: compiled model with %d entities from .test-dict.txt in %f seconds' % (
+ x,
+ ENTITIES_IN_DICTIONARY,
+ timeit.timeit(
+ setup = """
+import %s as pilsner
+model = pilsner.Model()
+model.add_normalizer('standard', None)
+recognizer = pilsner.Recognizer()
+fields = [
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'label_attr', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+]
+specs = recognizer.compile_dict_specs(fields)
+""" % (x),
+ stmt="""
+recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+""",
+ number=n
+ )
+ )
+ )
+ print(
+ '%s: saved model with %d entities in %f seconds' % (
+ x,
+ ENTITIES_IN_DICTIONARY,
+ timeit.timeit(
+ setup = """
+import %s as pilsner
model = pilsner.Model()
model.add_normalizer('standard', None)
recognizer = pilsner.Recognizer()
@@ -72,6 +173,70 @@ def create_test_text(number_of_words):
{'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = recognizer.compile_dict_specs(fields)
-recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\t', '\n', include_keywords=True)
+recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+""" % (x),
+ stmt="""
+model.save('.test-model')
+""",
+ number=n
+ )
+ )
+ )
+def perf_load_model_parse_test(modules_to_test):
+ n = 1
+ for x in modules_to_test:
+ print(
+ '%s: loaded model with %d entities from .test-model in %f seconds' % (
+ x,
+ ENTITIES_IN_DICTIONARY,
+ timeit.timeit(
+ setup = """
+import %s as pilsner
+model = pilsner.Model()
+""" % (x),
+ stmt="""
+model.load('.test-model')
+""",
+ number=n
+ )
+ )
+ )
+ print(
+ '%s: parsed text with %d words in .test-text.txt using model with %d entities in %f seconds' % (
+ x,
+ WORDS_IN_TEXT,
+ ENTITIES_IN_DICTIONARY,
+ timeit.timeit(
+ setup = """
+import %s as pilsner
+model = pilsner.Model()
+model.load('.test-model')
+recognizer = pilsner.Recognizer()
+with open('.test-text.txt', mode='r', encoding='utf8') as f:
+ test_text = f.read()
+""" % (x),
+ stmt="""
+found = recognizer.parse(model, test_text)
+""",
+ number=n
+ )
+ )
+ )
+
+def cleanup():
+ for filename in [
+ '.test-dict.txt',
+ '.test-labels.txt',
+ '.test-text.txt',
+ '.test-model.0.dictionary',
+ '.test-model.attributes',
+ '.test-model.keywords',
+ '.test-model.normalizers'
+ ]:
+ os.remove(filename)
-#print(model)
+if __name__ == '__main__':
+ create_test_dataset()
+ perf_compile_model_save_model(['pilsner'])
+ perf_load_model_parse_test(['pilsner'])
+ cleanup()
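Note: the harness uses the standard timeit pattern, building expensive state once in setup, timing only the statement of interest, and controlling repetitions with number. A minimal standalone example of the same pattern:

import timeit

elapsed = timeit.timeit(
    setup="data = list(range(100000))",
    stmt="sorted(data, reverse=True)",
    number=10,
)
print('sorted 100000 items 10 times in %f seconds' % elapsed)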
From b8aea02cded66f0edd2416cd940dc690137deee3 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Mon, 24 Aug 2020 00:45:15 -0400
Subject: [PATCH 044/116] cell_wall -> column_enclosure
---
pilsner/model.py | 4 ++--
test/ut_model.py | 4 ++--
test/ut_utility.py | 8 ++++----
3 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index c1acd8f..c626dc7 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -182,8 +182,8 @@ def store_attributes(self, line_number, internal_id, subtrie, specs, columns):
else:
_ = [self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split(specs['fields'][k][1]))]
- def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall):
- columns = [x.strip(cell_wall) for x in line.split(column_separator)]
+ def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line, column_separator, column_enclosure):
+ columns = [x.strip(column_enclosure) for x in line.strip('\n').split(column_separator)]
if line_number in line_numbers:
internal_id = line_numbers[line_number]
else:
diff --git a/test/ut_model.py b/test/ut_model.py
index 7c82e68..7a4fcaf 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -124,8 +124,8 @@ def test_get_dictionary_line(self):
line_number = 1
line = 't1\tentity_id\tstring_value\tsome_attr,another_attr\n'
column_separator = '\t'
- cell_wall = '\n'
- got_line = self.model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall)
+ column_enclosure = ''
+ got_line = self.model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, column_enclosure)
expected = (['t1', 'entity_id', 'string_value', 'some_attr,another_attr'], 0)
assert got_line == expected, 'Expected %s, got %s' % (str(expected), str(got_line))
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 6d1493f..8ca28e8 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -25,7 +25,7 @@ def compile_test_model(self):
'tokenizer1': 't1',
'tokenizer2': 't2'
}
- compiled = self.recognizer.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', cell_wall='\n', include_keywords=True)
+ compiled = self.recognizer.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', column_enclosure='', include_keywords=True)
return compiled, model
def test_init(self):
@@ -78,7 +78,7 @@ def test_make_recognizer(self):
]
specs = self.recognizer.compile_dict_specs(fields)
model = pilsner.Model()
- got_recognizer, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', cell_wall='\n', tokenizer_option=0)
+ got_recognizer, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected_recognizer = [
{
model.SPECS_KEY: {
@@ -123,8 +123,8 @@ def test_make_keywords(self):
]
specs = self.recognizer.compile_dict_specs(fields)
model = pilsner.Model()
- _, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', cell_wall='\n', tokenizer_option=0)
- keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', cell_wall='\n', tokenizer_option=0)
+ _, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
+ keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected = {
model.CONTENT_KEY: {
0: {'refrigerator', 'white', 'awesome', 'conflicting', 'refrigerators', 'refrigeratorx'},
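Note: alongside the rename, get_dictionary_line() now strips the newline from the whole line before splitting and then strips column_enclosure characters (quotes, for example) from every cell; passing '' is a safe no-op, since str.strip('') removes nothing. A sketch of the resulting parsing:

def split_dictionary_line(line, column_separator='\t', column_enclosure=''):
    return [cell.strip(column_enclosure) for cell in line.strip('\n').split(column_separator)]

assert split_dictionary_line('t1\tentity1\tsome label\n') == ['t1', 'entity1', 'some label']
assert split_dictionary_line('"t1"|"entity1"', '|', '"') == ['t1', 'entity1']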
From b030e37b2605264a6d40f696c82d7a21000fbcef Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Mon, 24 Aug 2020 00:46:01 -0400
Subject: [PATCH 045/116] spotted bug in disambiguate() -> TODO
---
pilsner/utility.py | 27 +++++++++++++++++++--------
1 file changed, 19 insertions(+), 8 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 226e526..7a8a824 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -45,7 +45,7 @@ def compile_dict_specs(self, fields):
logging.debug('Done compiling specs')
return specs
- def make_recognizer(self, model, filename, specs, word_separator, item_limit, compressed, column_separator, cell_wall, tokenizer_option):
+ def make_recognizer(self, model, filename, specs, word_separator, item_limit, compressed, column_separator, column_enclosure, tokenizer_option):
# TODO: review for refactoring
self.logger('Making recognizer using %s' % (filename))
self.push_message('Making recognizer using %s' % (filename), self.callback_status)
@@ -77,7 +77,7 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
trie = model.next_trie(specs, compressed, tokenizer_option, word_separator)
self.logger('Lines read: %d' % (line_count))
line_count = 0
- columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, cell_wall)
+ columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, column_enclosure)
synonym, normalizer_name = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
subtrie = trie[model.CONTENT_KEY][normalizer_name]
for character in synonym:
@@ -96,7 +96,7 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
self.logger('Recognizer completed.')
return ret, line_numbers
- def make_keywords(self, model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, cell_wall, tokenizer_option):
+ def make_keywords(self, model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, column_enclosure, tokenizer_option):
self.logger('Making keywords using %s... ' % (filename))
self.push_message('Making keywords from {0}'.format(filename), self.callback_status)
total_bytes = os.path.getsize(filename) + 1
@@ -115,7 +115,7 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
if this_progress_position != last_progress_position:
last_progress_position = this_progress_position
self.push_message(int(100 * chars_read / total_bytes), self.callback_progress)
- columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, cell_wall)
+ columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, column_enclosure)
internal_id_map[line_count] = internal_id
synonym, _ = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
if synonym not in synonyms:
@@ -132,7 +132,7 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
with open(filename, mode='r', encoding='utf8') as f:
line_count = 0
for line in f:
- columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, cell_wall)
+ columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, column_enclosure)
if internal_id in overlapping_ids:
synonym, _ = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
tokens = synonym.split(word_separator)
@@ -143,11 +143,11 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
self.logger('Done compiling keywords.')
return keywords
- def compile_model(self, model, filename, specs, word_separator, column_separator, cell_wall, compressed=True, item_limit=0, tokenizer_option=0, include_keywords=False, disambiguate_all=False):
- tries, line_numbers = self.make_recognizer(model, filename, specs, word_separator, item_limit, compressed, column_separator, cell_wall, tokenizer_option)
+ def compile_model(self, model, filename, specs, word_separator, column_separator, column_enclosure, compressed=True, item_limit=0, tokenizer_option=0, include_keywords=False, disambiguate_all=False):
+ tries, line_numbers = self.make_recognizer(model, filename, specs, word_separator, item_limit, compressed, column_separator, column_enclosure, tokenizer_option)
keywords = {model.CONTENT_KEY: {}, model.INTERNAL_ID_KEY: {}}
if include_keywords:
- keywords = self.make_keywords(model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, cell_wall, tokenizer_option)
+ keywords = self.make_keywords(model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, column_enclosure, tokenizer_option)
model[model.DICTIONARY_KEY] = tries
model[model.KEYWORDS_KEY] = keywords
return True
@@ -317,9 +317,20 @@ def disambiguate(self, model, recognized, srcs, word_separator):
#ei[j] = len(src[j])
ei.append(len(src[j]))
if k > 0:
+ # !!! TODO: rather than this, take map of normalizer [k-1] and remap location on map of normalizer[k] as a boundary
si[j] = _recognized[k-1][5][0][1]
+ # m = k - 1
+ # while m > 0 and _recognized[k][4][j] not in _recognized[m][4]:
+ # m -= 1
+ # if _recognized[k][4][j] in _recognized[k-1][4]:
+ # si[j] = _recognized[m][5][0][1]
if k < len(id_list) - 1:
ei[j] = _recognized[k+1][5][0][0]
+ # m = k + 1
+ # while m < len(id_list) - 1 and _recognized[k][4][j] not in _recognized[m][4]:
+ # m += 1
+ # if _recognized[k][4][j] in _recognized[m][4]:
+ # ei[j] = _recognized[m][5][0][0]
#tokens[j] = src[j][si[j]:ei[j]]
tokens.append(src[j][si[j]:ei[j]])
#s_tokens[j] = set(tokens[j].split(word_separator))
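Besides the disambiguate() TODO, this patch renames the cell_wall parameter to column_enclosure across make_recognizer(), make_keywords() and compile_model(), matching the test updates above. A minimal sketch of what the new name suggests, assuming column_enclosure denotes the character(s) wrapping each cell value rather than the column separator itself; the helper name split_dictionary_line is invented for illustration, pilsner's actual parsing lives in Model.get_dictionary_line():

    def split_dictionary_line(line, column_separator='\t', column_enclosure=''):
        # Split a dictionary line into cells, then trim an optional enclosure
        # character from each cell; with column_enclosure='' this is a no-op.
        cells = line.rstrip('\n').split(column_separator)
        if column_enclosure:
            cells = [cell.strip(column_enclosure) for cell in cells]
        return cells

    assert split_dictionary_line('"a"\t"b,c"', '\t', '"') == ['a', 'b,c']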
From 4403c706e56b785a6ad982e4b1d18d7a6c42c6e7 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Tue, 25 Aug 2020 01:40:03 -0400
Subject: [PATCH 046/116] think
---
pilsner/utility.py | 19 ++++++++++++-------
test/ut_utility.py | 6 +++++-
2 files changed, 17 insertions(+), 8 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 7a8a824..ddee46d 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -307,7 +307,7 @@ def disambiguate(self, model, recognized, srcs, word_separator):
si = []
src = []
ei = []
- tokens = []
+ tokens = {}
s_tokens = {}
for j in range(len(ids)):
#si[j] = 0
@@ -318,24 +318,27 @@ def disambiguate(self, model, recognized, srcs, word_separator):
ei.append(len(src[j]))
if k > 0:
# !!! TODO: rather than this, take map of normalizer [k-1] and remap location on map of normalizer[k] as a boundary
- si[j] = _recognized[k-1][5][0][1]
+ #si[j] = _recognized[k-1][5][0][1]
+ si[j] = _recognized[k][6][j][_recognized[k-1][3]]
# m = k - 1
# while m > 0 and _recognized[k][4][j] not in _recognized[m][4]:
# m -= 1
# if _recognized[k][4][j] in _recognized[k-1][4]:
# si[j] = _recognized[m][5][0][1]
if k < len(id_list) - 1:
- ei[j] = _recognized[k+1][5][0][0]
+ #ei[j] = _recognized[k+1][5][0][0]
+ ei[j] = _recognized[k][6][j][_recognized[k+1][2]]
# m = k + 1
# while m < len(id_list) - 1 and _recognized[k][4][j] not in _recognized[m][4]:
# m += 1
# if _recognized[k][4][j] in _recognized[m][4]:
# ei[j] = _recognized[m][5][0][0]
#tokens[j] = src[j][si[j]:ei[j]]
- tokens.append(src[j][si[j]:ei[j]])
+ #tokens.append(src[j][si[j]:ei[j]])
+ tokens[ids[j]] = src[j][si[j]:ei[j]]
#s_tokens[j] = set(tokens[j].split(word_separator))
#s_tokens.append(set(tokens[j].split(word_separator)))
- s_tokens[ids[j]] = set(tokens[j].split(word_separator))
+ s_tokens[ids[j]] = set(tokens[ids[j]].split(word_separator))
tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
winner_score = 0
@@ -364,7 +367,7 @@ def flatten_layers(self, model, layers):
location = tuple([_map[span[3]], _map[span[4]]])
if location not in spans:
spans[location] = []
- spans[location].append(tuple([span[0], span[1], [i] * len(span[0]), span[3], span[4]]))
+ spans[location].append(tuple([span[0], span[1], [i] * len(span[0]), span[3], span[4], _map]))
new_layers = []
for location in spans:
new_left = location[0]
@@ -373,12 +376,14 @@ def flatten_layers(self, model, layers):
new_attrs = {}
new_srcids = []
new_locations = []
+ new_map = []
for item in spans[location]:
new_ids += item[0]
new_attrs = {**new_attrs, **item[1]}
new_srcids += item[2]
new_locations.append(tuple([item[3], item[4]]))
- new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids, new_locations]))
+ new_map.append(item[5])
+ new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids, new_locations, new_map]))
if model[model.KEYWORDS_KEY] is not None:
new_layers = self.disambiguate(model, new_layers, srcs, ' ')
pass
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 8ca28e8..a808497 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -362,4 +362,8 @@ def test_parse(self):
assert output == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(output))
if __name__ == '__main__':
- unittest.main()
+ #unittest.main()
+ x = TestUtility()
+ x.setUp()
+ x.compile_test_model()
+ x.test_parse()
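This patch starts threading each normalization layer's character map into the flattened spans (the extra _map element appended in flatten_layers()) so that disambiguate() can translate a boundary found on one layer into a position on another, and it switches tokens from a list to a dict keyed by entity id. A sketch of the layer tuple this implies; the field meanings are inferred from the indexing in disambiguate() and flatten_layers() and are not authoritative:

    # layer tuple built by flatten_layers() after this patch (inferred):
    layer = (
        [5],                # 0: entity ids merged at this location
        {5: {'a': ['A']}},  # 1: their unpacked attributes
        8, 34,              # 2-3: boundaries remapped into the original string
        [1],                # 4: source layer index, one entry per id
        [(8, 34)],          # 5: boundaries on each source (normalized) layer
        [[0, 1, 2]],        # 6: character map of each contributing layer
    )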
From 6c716a00cbe983147ae79029d0fb891532e29338 Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Tue, 25 Aug 2020 02:41:25 -0400
Subject: [PATCH 047/116] think again
---
pilsner/utility.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index ddee46d..d30d7bb 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -319,7 +319,7 @@ def disambiguate(self, model, recognized, srcs, word_separator):
if k > 0:
# !!! TODO: rather than this, take map of normalizer [k-1] and remap location on map of normalizer[k] as a boundary
#si[j] = _recognized[k-1][5][0][1]
- si[j] = _recognized[k][6][j][_recognized[k-1][3]]
+ si[j] = _recognized[k][6][j][_recognized[k-1][5][0][1]]
# m = k - 1
# while m > 0 and _recognized[k][4][j] not in _recognized[m][4]:
# m -= 1
@@ -327,7 +327,7 @@ def disambiguate(self, model, recognized, srcs, word_separator):
# si[j] = _recognized[m][5][0][1]
if k < len(id_list) - 1:
#ei[j] = _recognized[k+1][5][0][0]
- ei[j] = _recognized[k][6][j][_recognized[k+1][2]]
+ ei[j] = _recognized[k][6][j][_recognized[k+1][4][0][1]]
# m = k + 1
# while m < len(id_list) - 1 and _recognized[k][4][j] not in _recognized[m][4]:
# m += 1
From eb939cd0908dbb5e69cb82ab46c32e19b376bdea Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Mon, 31 Aug 2020 01:24:17 -0400
Subject: [PATCH 048/116] unit tests, disambiguation (wip)
---
pilsner/utility.py | 37 +++++++++++++++++++++++--------
test/assets/sample_dictionary.txt | 2 +-
test/ut_utility.py | 22 +++++++++---------
3 files changed, 40 insertions(+), 21 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index d30d7bb..34aacd5 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -304,22 +304,32 @@ def disambiguate(self, model, recognized, srcs, word_separator):
ids = id_list[k]
if len(ids) < 2:
continue
- si = []
- src = []
- ei = []
+ si = {}
+ src = {}
+ ei = {}
tokens = {}
s_tokens = {}
for j in range(len(ids)):
#si[j] = 0
- si.append(0)
+ #si.append(0)
+ si[ids[j]] = 0
#src[j] = srcs[recognized[k][4][j]]
- src.append(srcs[_recognized[k][4][j]])
+ #src.append(srcs[_recognized[k][4][j]])
+ src[ids[j]] = srcs[_recognized[k][4][j]]
#ei[j] = len(src[j])
- ei.append(len(src[j]))
+ #ei.append(len(src[j]))
+ ei[ids[j]] = len(src[ids[j]])
+
if k > 0:
# !!! TODO: rather than this, take map of normalizer [k-1] and remap location on map of normalizer[k] as a boundary
#si[j] = _recognized[k-1][5][0][1]
- si[j] = _recognized[k][6][j][_recognized[k-1][5][0][1]]
+
+ #si[ids[j]] = _recognized[k][6][ _recognized[k][4][j] ][_recognized[k-1][5][0][1]]
+
+ # See above, think
+ if _recognized[k-1][5][0][1] > si[ids[j]]:
+ si[ids[j]] = _recognized[k-1][5][0][1]
+
# m = k - 1
# while m > 0 and _recognized[k][4][j] not in _recognized[m][4]:
# m -= 1
@@ -327,7 +337,11 @@ def disambiguate(self, model, recognized, srcs, word_separator):
# si[j] = _recognized[m][5][0][1]
if k < len(id_list) - 1:
#ei[j] = _recognized[k+1][5][0][0]
- ei[j] = _recognized[k][6][j][_recognized[k+1][4][0][1]]
+
+ #ei[ids[j]] = _recognized[k][6][ _recognized[k][4][j] ][_recognized[k+1][5][0][0]]
+
+ if _recognized[k+1][5][0][0] < ei[ids[j]]:
+ ei[ids[j]] = _recognized[k+1][5][0][0]
# m = k + 1
# while m < len(id_list) - 1 and _recognized[k][4][j] not in _recognized[m][4]:
# m += 1
@@ -335,10 +349,15 @@ def disambiguate(self, model, recognized, srcs, word_separator):
# ei[j] = _recognized[m][5][0][0]
#tokens[j] = src[j][si[j]:ei[j]]
#tokens.append(src[j][si[j]:ei[j]])
- tokens[ids[j]] = src[j][si[j]:ei[j]]
+
+ #tokens[ids[j]] = src[j][si[j]:ei[j]]
+ tokens[ids[j]] = src[ids[j]][si[ids[j]]:ei[ids[j]]]
+
#s_tokens[j] = set(tokens[j].split(word_separator))
#s_tokens.append(set(tokens[j].split(word_separator)))
s_tokens[ids[j]] = set(tokens[ids[j]].split(word_separator))
+ # tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
+ # kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
winner_score = 0
diff --git a/test/assets/sample_dictionary.txt b/test/assets/sample_dictionary.txt
index ee98b2a..4de7ac6 100644
--- a/test/assets/sample_dictionary.txt
+++ b/test/assets/sample_dictionary.txt
@@ -4,6 +4,6 @@ tokenizer2 entity2 conflicting refrigerator D,E
tokenizer2 entity1 awesome white refrigerators A,B,C
tokenizer1 entity1 awesome white refrigerator A,B,C
tokenizer2 entity1 awwsome white refrigerator A,B,C
-tokenizer2 entity1 it A,B,C
+tokenizer2 entity2 it A,B,C
tokenizer2 entity1 o A,B,C
tokenizer1 entity1 conflicting refrigerator A,B,C
diff --git a/test/ut_utility.py b/test/ut_utility.py
index a808497..851d93b 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -107,7 +107,7 @@ def test_make_recognizer(self):
3: 1,
4: 1,
5: 1,
- 6: 1,
+ 6: 0,
7: 1,
8: 1
}
@@ -127,8 +127,8 @@ def test_make_keywords(self):
keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected = {
model.CONTENT_KEY: {
- 0: {'refrigerator', 'white', 'awesome', 'conflicting', 'refrigerators', 'refrigeratorx'},
- 1: {'o', 'white', 'conflicting', 'refrigerators', 'awwsome', 'it', 'refrigerator', 'awesome', 'refrigerator'}
+ 0: {'it', 'refrigeratorx', 'white', 'awesome', 'refrigerator', 'conflicting', 'refrigerators'},
+ 1: {'conflicting', 'white', 'awesome', 'refrigerator', 'o', 'refrigerators', 'awwsome'}
},
model.INTERNAL_ID_KEY: {
0: 0,
@@ -137,7 +137,7 @@ def test_make_keywords(self):
3: 1,
4: 1,
5: 1,
- 6: 1,
+ 6: 0,
7: 1,
8: 1
}
@@ -175,7 +175,7 @@ def test_compile_model(self):
model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}, 'c': {'onflicting refrigerator': {model.ENTITY_KEY: [8]}}}, 't2': {'c': {'onflicting refrigerator': {model.ENTITY_KEY: [2]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
}
]
- expected_keywords = {model.CONTENT_KEY: {0: {'refrigera', 'refrigeratorx', 'tors', 'refrigerator', 'white', 'awesome', 'conflicting'}, 1: {'it', 'o', 'awwsome', 'white', 'refrigerator', 'refrigerator', 'conflicting', 'refrigerators', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1}}
+ expected_keywords = {model.CONTENT_KEY: {0: {'refrigerator', 'tors', 'it', 'refrigera', 'white', 'conflicting', 'awesome', 'refrigeratorx'}, 1: {'refrigerator', 'refrigerators', 'white', 'o', 'conflicting', 'awwsome', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 0, 7: 1, 8: 1}}
assert model[model.DICTIONARY_KEY] == expected_dictionary, '\nExpected\n%s\nGot\n%s' % (str(expected_dictionary), str(model[model.DICTIONARY_KEY]))
assert model[model.KEYWORDS_KEY] == expected_keywords, '\nExpected\n%s\nGot\n%s' % (str(expected_keywords), str(model[model.KEYWORDS_KEY]))
@@ -355,15 +355,15 @@ def test_parse(self):
expected = {
(8, 34): {'entity_id': {'entity1'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
(35, 36): {'entity_id': {'entity1'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
- (54, 56): {'entity_id': {'entity1'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
+ (54, 56): {'entity_id': {'entity2'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
(66, 90): {'entity_id': {'entity2'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'D', 'E'}}
}
output = self.recognizer.parse(model, source_string)
assert output == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(output))
if __name__ == '__main__':
- #unittest.main()
- x = TestUtility()
- x.setUp()
- x.compile_test_model()
- x.test_parse()
+ unittest.main()
+ #x = TestUtility()
+ #x.setUp()
+ #x.compile_test_model()
+ #x.test_parse()
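The disambiguation pass, as these hunks shape it: each ambiguous span collects the surrounding tokens, and each candidate id is scored by how many of its distinctive keywords occur there. A standalone sketch of that scoring idea follows. Note that the in-patch comprehension {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i} subtracts only one competitor per id (the last j written wins), whereas this sketch subtracts all competitors, which may or may not be the intended behavior:

    def pick_winners(candidate_keywords, context_tokens):
        # candidate_keywords: {entity_id: set of keywords for that id}
        # distinctive[i] keeps only keywords no competing id shares
        distinctive = {
            i: kws - set().union(*(candidate_keywords[j]
                                   for j in candidate_keywords if j != i))
            for i, kws in candidate_keywords.items()
        }
        scores = {i: len(kws & context_tokens) for i, kws in distinctive.items()}
        best = max(scores.values())
        return {i for i, s in scores.items() if s == best and best > 0}

    tokens = {'awesome', 'white', 'refrigerator'}
    print(pick_winners({0: {'it', 'awesome'}, 1: {'awwsome', 'o'}}, tokens))  # {0}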
From dbd926e5d1e3fdef6b4f77a5ce5de03165b1afbc Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 4 Sep 2020 00:57:50 -0400
Subject: [PATCH 049/116] removed unnecessary module
---
test/functional.py | 12 ------------
1 file changed, 12 deletions(-)
delete mode 100644 test/functional.py
diff --git a/test/functional.py b/test/functional.py
deleted file mode 100644
index c4911a1..0000000
--- a/test/functional.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import os
-import sys; sys.path.insert(0, '')
-import unittest
-import pilsner # pylint: disable=E0611,F0401
-
-class FunctionalTest(unittest.TestCase):
-
- def test_ad_hoc_load_model(self):
- pass
-
-if __name__ == '__main__':
- unittest.main()
From 9cd0be1f5b57870114a86d37f5d2a46987fe6114 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 4 Sep 2020 00:58:50 -0400
Subject: [PATCH 050/116] sandbox
---
test/sandbox.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/test/sandbox.py b/test/sandbox.py
index 54229bc..ec6845c 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -30,7 +30,8 @@ def save_it():
_messages.clear()
r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
print(m['~keywords'])
- s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
+ #s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
+ s = 'this is awwsome white refrigerator , and it is awesome white refrigerator'
_messages.clear()
q = r.parse(m, s)
print(q)
From 680e74fed72a0b13b2b8ecc5d859d708f0684955 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 4 Sep 2020 02:39:09 -0400
Subject: [PATCH 051/116] disambiguation
---
pilsner/utility.py | 35 ++++++++++++++++++++++-------------
1 file changed, 22 insertions(+), 13 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 34aacd5..2f5ad96 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -327,8 +327,10 @@ def disambiguate(self, model, recognized, srcs, word_separator):
#si[ids[j]] = _recognized[k][6][ _recognized[k][4][j] ][_recognized[k-1][5][0][1]]
# See above, think
- if _recognized[k-1][5][0][1] > si[ids[j]]:
- si[ids[j]] = _recognized[k-1][5][0][1]
+ #if _recognized[k-1][5][0][1] > si[ids[j]]:
+ if _recognized[k][7][ids[j]][_recognized[k-1][3]][1] > si[ids[j]]:
+ #si[ids[j]] = _recognized[k-1][5][0][1]
+ si[ids[j]] = _recognized[k][7][ids[j]][_recognized[k-1][3]][1]
# m = k - 1
# while m > 0 and _recognized[k][4][j] not in _recognized[m][4]:
@@ -340,8 +342,10 @@ def disambiguate(self, model, recognized, srcs, word_separator):
#ei[ids[j]] = _recognized[k][6][ _recognized[k][4][j] ][_recognized[k+1][5][0][0]]
- if _recognized[k+1][5][0][0] < ei[ids[j]]:
- ei[ids[j]] = _recognized[k+1][5][0][0]
+ #if _recognized[k+1][5][0][0] < ei[ids[j]]:
+ if _recognized[k][7][ids[j]][_recognized[k+1][2]][0] < ei[ids[j]]:
+ #ei[ids[j]] = _recognized[k+1][5][0][0]
+ ei[ids[j]] = _recognized[k][7][ids[j]][_recognized[k+1][2]][0]
# m = k + 1
# while m < len(id_list) - 1 and _recognized[k][4][j] not in _recognized[m][4]:
# m += 1
@@ -378,7 +382,8 @@ def flatten_layers(self, model, layers):
srcs = []
for i in range(0, len(layers)):
layer = layers[i]
- _map = layer[0]
+ _map = layer[0][0]
+ _r_map = layer[0][1]
_recognized = layer[1]
_src = layer[2]
srcs.append(_src)
@@ -386,7 +391,7 @@ def flatten_layers(self, model, layers):
location = tuple([_map[span[3]], _map[span[4]]])
if location not in spans:
spans[location] = []
- spans[location].append(tuple([span[0], span[1], [i] * len(span[0]), span[3], span[4], _map]))
+ spans[location].append(tuple([span[0], span[1], [i] * len(span[0]), span[3], span[4], _map, _r_map]))
new_layers = []
for location in spans:
new_left = location[0]
@@ -395,15 +400,18 @@ def flatten_layers(self, model, layers):
new_attrs = {}
new_srcids = []
new_locations = []
- new_map = []
+ new_map = {}
+ new_r_map = {}
for item in spans[location]:
new_ids += item[0]
new_attrs = {**new_attrs, **item[1]}
- new_srcids += item[2]
- new_locations.append(tuple([item[3], item[4]]))
- new_map.append(item[5])
- new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids, new_locations, new_map]))
- if model[model.KEYWORDS_KEY] is not None:
+ if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]:
+ new_srcids += item[2]
+ new_locations.append(tuple([item[3], item[4]]))
+ new_map.update({model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][k]: item[5] for k in item[0]})
+ new_r_map.update({model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][k]: item[6] for k in item[0]})
+ new_layers.append(tuple([new_ids, new_attrs, new_left, new_right, new_srcids, new_locations, new_map, new_r_map]))
+ if model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]:
new_layers = self.disambiguate(model, new_layers, srcs, ' ')
pass
ret = [x[0:4] for x in new_layers]
@@ -495,10 +503,11 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
for normalizer_name in model[model.NORMALIZER_KEY]:
normalized_string = model[model.NORMALIZER_KEY][normalizer_name].normalize(source_string, model[model.WORD_SEPARATOR_KEY], model[model.TOKENIZER_OPTION_KEY])
character_map = model[model.NORMALIZER_KEY][normalizer_name].result['map']
+ r_character_map = model[model.NORMALIZER_KEY][normalizer_name].result['r_map']
progress_from = current_normalizer_index * spot_progress_share
progress_to = (current_normalizer_index + 1) * spot_progress_share
parsed = self.spot_entities(model, normalized_string, normalizer_name, include_query, exclude_query, process_exclude, attrs_out_query, progress_from=progress_from, progress_to=progress_to)
- rets.append((character_map, parsed, normalized_string))
+ rets.append(((character_map, r_character_map), parsed, normalized_string))
current_normalizer_index += 1
layers = self.flatten_layers(model, rets)
spans = self.flatten_spans(layers)
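parse() now forwards both directions of sic's normalization mapping: result['map'] (normalized position to original position) and result['r_map'], which flatten_layers() keys by internal id so disambiguate() can re-express a boundary from one normalizer's layer on another layer whose string has a different length. A small hand-built illustration; the maps below are stand-ins for the idea, not sic's actual output format:

    original   = 'awwsome,white'
    normalized = 'awwsome , white'          # a tokenizer that pads punctuation
    # forward map: index in normalized -> index in original
    fwd = [0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 8, 9, 10, 11, 12]
    # reverse map: index in original -> indices in normalized
    rev = {}
    for n_idx, o_idx in enumerate(fwd):
        rev.setdefault(o_idx, []).append(n_idx)
    # a boundary found on this layer converts to original coordinates via fwd,
    # and converts onto another layer via that layer's rev map
    assert fwd[8] == 7 and rev[7] == [7, 8, 9]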
From f378cd8c0a8036d280404da498a91c6383a0668c Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 4 Sep 2020 02:39:37 -0400
Subject: [PATCH 052/116] updated unit tests
---
scripts/linux/unittest.sh | 2 +-
scripts/win/unittest.bat | 2 +-
test/assets/sample_dictionary.txt | 2 +-
test/sandbox.py | 12 ++++++------
test/ut_utility.py | 20 +++++++++++---------
5 files changed, 20 insertions(+), 18 deletions(-)
diff --git a/scripts/linux/unittest.sh b/scripts/linux/unittest.sh
index dd02fd2..ec23305 100755
--- a/scripts/linux/unittest.sh
+++ b/scripts/linux/unittest.sh
@@ -4,7 +4,7 @@ MYDIR=`pwd`
ROOT=${MYDIR}/../..
ENV=.env.36
TEST=${ROOT}/test
-FILES="ut_model.py ut_utility.py functional.py performance.py"
+FILES="ut_model.py ut_utility.py performance.py"
cd ${ROOT}
for FILE in ${FILES}
do
diff --git a/scripts/win/unittest.bat b/scripts/win/unittest.bat
index 4453ba7..ae32a49 100644
--- a/scripts/win/unittest.bat
+++ b/scripts/win/unittest.bat
@@ -3,7 +3,7 @@ set RUNDIR=%cd%
set ROOT=%~dp0..\..
set ENV=.env.37
set TEST=%ROOT%\test
-set FILES=ut_model.py ut_utility.py functional.py performance.py
+set FILES=ut_model.py ut_utility.py performance.py
cd %ROOT%
(for %%f in (%FILES%) do (
echo Running %%f
diff --git a/test/assets/sample_dictionary.txt b/test/assets/sample_dictionary.txt
index 4de7ac6..7eae706 100644
--- a/test/assets/sample_dictionary.txt
+++ b/test/assets/sample_dictionary.txt
@@ -6,4 +6,4 @@ tokenizer1 entity1 awesome white refrigerator A,B,C
tokenizer2 entity1 awwsome white refrigerator A,B,C
tokenizer2 entity2 it A,B,C
tokenizer2 entity1 o A,B,C
-tokenizer1 entity1 conflicting refrigerator A,B,C
+tokenizer2 entity1 conflicting refrigerator A,B,C
diff --git a/test/sandbox.py b/test/sandbox.py
index ec6845c..4d4a633 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -33,9 +33,9 @@ def save_it():
#s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
s = 'this is awwsome white refrigerator , and it is awesome white refrigerator'
_messages.clear()
- q = r.parse(m, s)
- print(q)
- #m.save('.test_model')
+ #q = r.parse(m, s)
+ #print(q)
+ m.save('.test_model')
def load_it():
rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_mesage)
@@ -43,11 +43,11 @@ def load_it():
s = 'this is awesome white refrigerators o refrigerator, is it not'
s *= 10
_messages.clear()
- q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
- print(q)
+ #q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
+ #print(q)
save_it()
-#load_it()
+load_it()
#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
#r = Recognizer()
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 851d93b..e4b1455 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -172,7 +172,7 @@ def test_compile_model(self):
model.COMPRESSED_KEY: 1,
model.TOKENIZER_OPTION_KEY: 0,
model.WORD_SEPARATOR_KEY: ' ',
- model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}, 'c': {'onflicting refrigerator': {model.ENTITY_KEY: [8]}}}, 't2': {'c': {'onflicting refrigerator': {model.ENTITY_KEY: [2]}}, 'a': {'w': {'e': {'some refrigerators': {model.ENTITY_KEY: [3]}}, 'w': {'some refrigerator': {model.ENTITY_KEY: [5]}}}}, 'i': {'t': {model.ENTITY_KEY: [6]}}, 'o': {model.ENTITY_KEY: [7]}}}
+ model.CONTENT_KEY: {'t1': {'a': {'wesome white refrigera': {' ': {'tors': {'~i': [0]}}, 't': {'or': {'x': {'~i': [1]}, '~i': [4]}}}}}, 't2': {'c': {'onflicting refrigerator': {'~i': [2, 8]}}, 'a': {'w': {'e': {'some refrigerators': {'~i': [3]}}, 'w': {'some refrigerator': {'~i': [5]}}}}, 'i': {'t': {'~i': [6]}}, 'o': {'~i': [7]}}}
}
]
expected_keywords = {model.CONTENT_KEY: {0: {'refrigerator', 'tors', 'it', 'refrigera', 'white', 'conflicting', 'awesome', 'refrigeratorx'}, 1: {'refrigerator', 'refrigerators', 'white', 'o', 'conflicting', 'awwsome', 'awesome'}}, model.INTERNAL_ID_KEY: {0: 0, 1: 0, 2: 0, 3: 1, 4: 1, 5: 1, 6: 0, 7: 1, 8: 1}}
@@ -194,7 +194,7 @@ def test_unpack_attributes(self):
exclude_query = ''
process_exclude = False
attrs_out_query = ''
- expected = {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer1'], 'some_attribute': ['A', 'B', 'C']}}
+ expected = {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}}
attributes = self.recognizer.unpack_attributes(cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query)
assert attributes == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(attributes))
@@ -206,7 +206,7 @@ def test_check_attrs(self):
exclude_query = ''
process_exclude = False
attrs_out_query = ''
- expected = {model.ENTITY_KEY: [8], model.ATTRS_KEY: {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer1'], 'some_attribute': ['A', 'B', 'C']}}}
+ expected = {model.ENTITY_KEY: [8], model.ATTRS_KEY: {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}}}
got_leaf = self.recognizer.check_attrs(model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query)
assert got_leaf == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(got_leaf))
@@ -266,7 +266,10 @@ def test_flatten_layers(self):
# two normalization layers; first has one span; second has two spans
layers = [
(
- [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
+ (
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
+ [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [20, 20], [21, 21], [22, 22], [23, 23], [24, 24], [25, 25], [26, 26], [27, 27], [28, 28], [29, 29], [30, 30], [31, 31], [32, 32], [33, 33], [34, 34], [35, 35], [36, 36], [37, 37], [38, 38], [39, 39], [40, 40], [41, 41], [42, 42], [43, 43], [44, 44], [45, 45], [46, 46], [47, 47], [48, 48], [49, 49], [50, 50], [51, 51], [52, 52], [53, 53], [54, 54], [55, 55], [56, 56], [57, 57], [58, 58], [59, 59], [60, 60], [61, 61], [62, 62], [63, 63], [64, 64], [65, 65], [66, 66], [67, 67], [68, 68], [69, 69], [70, 70], [71, 71], [72, 72]]
+ ),
[
(
[4],
@@ -278,7 +281,10 @@ def test_flatten_layers(self):
'this is awwsome white refrigerator , and it is awesome white refrigerator'
),
(
- [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
+ (
+ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
+ [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [14, 14], [14, 14], [14, 14], [14, 14], [14, 14], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [20, 20], [21, 21], [22, 22], [23, 23], [24, 24], [25, 25], [26, 26], [27, 27], [28, 28], [29, 29], [30, 30], [31, 31], [32, 32], [33, 33], [34, 34], [35, 35], [36, 36], [37, 37], [38, 38], [39, 39], [40, 40], [41, 41], [42, 42], [43, 43], [44, 44], [45, 45], [46, 46], [47, 47], [47, 47], [47, 47], [47, 47], [47, 47], [47, 47], [47, 47], [48, 48], [49, 49], [50, 50], [51, 51], [52, 52], [53, 53], [54, 54], [55, 55], [56, 56], [57, 57], [58, 58], [59, 59], [60, 60]]
+ ),
[
(
[5],
@@ -363,7 +369,3 @@ def test_parse(self):
if __name__ == '__main__':
unittest.main()
- #x = TestUtility()
- #x.setUp()
- #x.compile_test_model()
- #x.test_parse()
From e3a08d0a44f30d867a056a3b5f7b8fb5ba70f964 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 5 Sep 2020 23:13:28 -0400
Subject: [PATCH 053/116] added requirements
---
requirements.txt | 1 +
1 file changed, 1 insertion(+)
create mode 100644 requirements.txt
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c8c5d07
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+sic>=1.0.4 --no-binary sic
From ea3058c0079f28e93deb0ab25ee98399153b6e72 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Mon, 7 Sep 2020 01:07:45 -0400
Subject: [PATCH 054/116] renamed scripts
---
scripts/linux/{unittest.sh => testing.sh} | 0
scripts/win/{unittest.bat => testing.bat} | 0
2 files changed, 0 insertions(+), 0 deletions(-)
rename scripts/linux/{unittest.sh => testing.sh} (100%)
mode change 100755 => 100644
rename scripts/win/{unittest.bat => testing.bat} (100%)
diff --git a/scripts/linux/unittest.sh b/scripts/linux/testing.sh
old mode 100755
new mode 100644
similarity index 100%
rename from scripts/linux/unittest.sh
rename to scripts/linux/testing.sh
diff --git a/scripts/win/unittest.bat b/scripts/win/testing.bat
similarity index 100%
rename from scripts/win/unittest.bat
rename to scripts/win/testing.bat
From df38633d1d2a966c1aa05de6174f773af66de6b3 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 9 Sep 2020 20:31:31 -0400
Subject: [PATCH 055/116] chmod
---
scripts/linux/testing.sh | 0
1 file changed, 0 insertions(+), 0 deletions(-)
mode change 100644 => 100755 scripts/linux/testing.sh
diff --git a/scripts/linux/testing.sh b/scripts/linux/testing.sh
old mode 100644
new mode 100755
From f357bbf722a20699fee4bde7e396d27fcb4b2a0b Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 10 Sep 2020 22:57:27 -0400
Subject: [PATCH 056/116] clean up
---
pilsner/utility.py | 45 +++------------------------------------------
1 file changed, 3 insertions(+), 42 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 2f5ad96..3858cc0 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -138,7 +138,7 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
tokens = synonym.split(word_separator)
overlapping_ids[internal_id] = overlapping_ids[internal_id].union(set(tokens))
line_count += 1
- # TODO: only leave tokens unique for a given internal_id
+ # TODO: only leave tokens unique for a given internal_id (?)
keywords = {model.CONTENT_KEY: overlapping_ids, model.INTERNAL_ID_KEY: internal_id_map}
self.logger('Done compiling keywords.')
return keywords
@@ -310,58 +310,19 @@ def disambiguate(self, model, recognized, srcs, word_separator):
tokens = {}
s_tokens = {}
for j in range(len(ids)):
- #si[j] = 0
- #si.append(0)
si[ids[j]] = 0
- #src[j] = srcs[recognized[k][4][j]]
- #src.append(srcs[_recognized[k][4][j]])
src[ids[j]] = srcs[_recognized[k][4][j]]
- #ei[j] = len(src[j])
- #ei.append(len(src[j]))
ei[ids[j]] = len(src[ids[j]])
-
if k > 0:
- # !!! TODO: rather than this, take map of normalizer [k-1] and remap location on map of normalizer[k] as a boundary
- #si[j] = _recognized[k-1][5][0][1]
-
- #si[ids[j]] = _recognized[k][6][ _recognized[k][4][j] ][_recognized[k-1][5][0][1]]
-
- # See above, think
- #if _recognized[k-1][5][0][1] > si[ids[j]]:
+ # take map from normalizer [k-1] and remap location on map of normalizer[k] as a boundary
if _recognized[k][7][ids[j]][_recognized[k-1][3]][1] > si[ids[j]]:
- #si[ids[j]] = _recognized[k-1][5][0][1]
si[ids[j]] = _recognized[k][7][ids[j]][_recognized[k-1][3]][1]
-
- # m = k - 1
- # while m > 0 and _recognized[k][4][j] not in _recognized[m][4]:
- # m -= 1
- # if _recognized[k][4][j] in _recognized[k-1][4]:
- # si[j] = _recognized[m][5][0][1]
if k < len(id_list) - 1:
- #ei[j] = _recognized[k+1][5][0][0]
-
- #ei[ids[j]] = _recognized[k][6][ _recognized[k][4][j] ][_recognized[k+1][5][0][0]]
-
- #if _recognized[k+1][5][0][0] < ei[ids[j]]:
+ # take map from normalizer [k+1] and remap location on map of normalizer[k] as a boundary
if _recognized[k][7][ids[j]][_recognized[k+1][2]][0] < ei[ids[j]]:
- #ei[ids[j]] = _recognized[k+1][5][0][0]
ei[ids[j]] = _recognized[k][7][ids[j]][_recognized[k+1][2]][0]
- # m = k + 1
- # while m < len(id_list) - 1 and _recognized[k][4][j] not in _recognized[m][4]:
- # m += 1
- # if _recognized[k][4][j] in _recognized[m][4]:
- # ei[j] = _recognized[m][5][0][0]
- #tokens[j] = src[j][si[j]:ei[j]]
- #tokens.append(src[j][si[j]:ei[j]])
-
- #tokens[ids[j]] = src[j][si[j]:ei[j]]
tokens[ids[j]] = src[ids[j]][si[ids[j]]:ei[ids[j]]]
-
- #s_tokens[j] = set(tokens[j].split(word_separator))
- #s_tokens.append(set(tokens[j].split(word_separator)))
s_tokens[ids[j]] = set(tokens[ids[j]].split(word_separator))
- # tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
- # kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
tmp = {i: model[model.KEYWORDS_KEY][model.CONTENT_KEY][i] if i in model[model.KEYWORDS_KEY][model.CONTENT_KEY] else set() for i in ids}
kwd = {i: tmp[i] - tmp[j] for i in tmp for j in tmp if j != i}
winner_score = 0
From ed91278f5dd640e57478cb3bae9bd6d89be76bc1 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 12 Sep 2020 00:19:45 -0400
Subject: [PATCH 057/116] think
---
pilsner/model.pxd | 17 +++++++++++++++++
scripts/win/makepyd.bat | 14 ++++++++++++++
test/compile.py | 8 ++++++++
3 files changed, 39 insertions(+)
create mode 100644 pilsner/model.pxd
create mode 100644 scripts/win/makepyd.bat
create mode 100644 test/compile.py
diff --git a/pilsner/model.pxd b/pilsner/model.pxd
new file mode 100644
index 0000000..f1052f5
--- /dev/null
+++ b/pilsner/model.pxd
@@ -0,0 +1,17 @@
+import cython
+
+cdef class Model(dict):
+
+ @cython.locals(
+ normalizers=cython.dict,
+ dictionary_number=cython.int
+ )
+ cpdef bint save(self, str filename)
+
+ @cython.locals(
+ normalizers=cython.dict,
+ _filename=cython.str,
+ dictionary=cython.dict,
+ keywords=cython.dict
+ )
+ cpdef load(self, str filename)
diff --git a/scripts/win/makepyd.bat b/scripts/win/makepyd.bat
new file mode 100644
index 0000000..375365f
--- /dev/null
+++ b/scripts/win/makepyd.bat
@@ -0,0 +1,14 @@
+@echo off
+set RUNDIR=%cd%
+set ROOT=%~dp0..\..
+set ENV=.env.37
+set SRC=%ROOT%\pilsner
+set DIST=%ROOT%\pyd
+set TEST=%ROOT%\test
+cd %ROOT%
+rmdir /S /Q %ROOT%\build
+rmdir /S /Q %ROOT%\cythonized
+if not exist %DIST%\nul mkdir %DIST%
+call %ROOT%\%ENV%\Scripts\python %TEST%\compile.py build_ext --inplace
+move /Y %SRC%\*.pyd %DIST%\
+cd %RUNDIR%
diff --git a/test/compile.py b/test/compile.py
new file mode 100644
index 0000000..874dc19
--- /dev/null
+++ b/test/compile.py
@@ -0,0 +1,8 @@
+from distutils.core import setup
+from Cython.Build import cythonize
+
+src = [
+ 'pilsner/model.py'
+]
+
+setup(ext_modules=cythonize(src, compiler_directives={'language_level': '3'}, build_dir='cythonized'))
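The new model.pxd relies on Cython's pure-Python mode: a .pxd file that shares a module's basename augments it, so pilsner/model.py stays importable as plain Python while cythonize() compiles it with the declared C types, and @cython.locals(...) in the .pxd types the local variables of the matching function in the .py file. A minimal hypothetical pair showing the mechanism (file names invented for illustration):

    # fast.py -- ordinary Python, no Cython syntax needed
    def count_char(text, target):
        n = 0
        for ch in text:
            if ch == target:
                n += 1
        return n

    # fast.pxd -- augmenting declarations picked up by cythonize():
    #     import cython
    #
    #     @cython.locals(n=cython.int, ch=cython.str)
    #     cpdef int count_char(str text, str target)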
From 4f0f8f8b2dd71275c63a6d5714dd915f5ddcca59 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 12 Sep 2020 01:25:40 -0400
Subject: [PATCH 058/116] model (pxd)
---
pilsner/model.pxd | 71 ++++++++++++++++++++++++++++++++++++++++++++++-
pilsner/model.py | 46 +++++++++++++++---------------
2 files changed, 94 insertions(+), 23 deletions(-)
diff --git a/pilsner/model.pxd b/pilsner/model.pxd
index f1052f5..8f2e45c 100644
--- a/pilsner/model.pxd
+++ b/pilsner/model.pxd
@@ -2,6 +2,29 @@ import cython
cdef class Model(dict):
+ cdef public str CONTENT_KEY
+ cdef public str SPECS_KEY
+ cdef public str COMPRESSED_KEY
+ cdef public str TOKENIZER_OPTION_KEY
+ cdef public str WORD_SEPARATOR_KEY
+ cdef public str ENTITY_KEY
+ cdef public str ATTRS_KEY
+ cdef public str INTERNAL_ID_KEY
+ cdef public str DICTIONARY_KEY
+ cdef public str KEYWORDS_KEY
+ cdef public str NORMALIZER_KEY
+ cdef public str DEFAULT_NORMALIZER_KEY
+ cdef public str DATASOURCE_KEY
+ cdef public str DEFAULT_DATASOURCE_PATH
+ cdef public str DEFAULT_DATASOURCE_FILENAME
+ cdef public str DEFAULT_DATASOURCE
+ cdef public str DEFAULT_WORD_SEPARATOR
+ cdef public int DEFAULT_TOKENIZER_OPTION
+ cdef public connection
+ cdef public cursor
+ cdef public normalizer_map
+ cdef public sic_builder
+
@cython.locals(
normalizers=cython.dict,
dictionary_number=cython.int
@@ -14,4 +37,50 @@ cdef class Model(dict):
dictionary=cython.dict,
keywords=cython.dict
)
- cpdef load(self, str filename)
+ cpdef bint load(self, str filename)
+
+ cpdef bint add_normalizer(self, str normalizer_name, str filename, bint default=*)
+
+ cpdef bint create_recognizer_schema(self, cursor)
+
+ @cython.locals(
+ children=cython.dict,
+ child_count=cython.int,
+ key=cython.str,
+ child=cython.dict,
+ next_prefix=cython.str,
+ comp_child=cython.dict,
+ comp_key=cython.str,
+ comp_children=cython.dict
+ )
+ cpdef tuple pack_subtrie(self, dict trie, bint compressed, str prefix)
+
+ @cython.locals(
+ ret=cython.dict,
+ normalizer_name=cython.str,
+ packed=cython.dict
+ )
+ cpdef dict pack_trie(self, dict trie, bint compressed)
+
+ @cython.locals(
+ k=cython.str
+ )
+ cpdef store_attributes(self, int line_number, int internal_id, dict subtrie, dict specs, list columns)
+
+ @cython.locals(
+ columns=cython.list,
+ internal_id=cython.int,
+ entity_id=cython.str
+ )
+ cpdef tuple get_dictionary_line(self, dict specs, dict entity_ids, dict line_numbers, int line_number, str line, str column_separator, str column_enclosure)
+
+ @cython.locals(
+ synonym=cython.str,
+ normalizer_name=cython.str
+ )
+ cpdef tuple get_dictionary_synonym(self, list columns, dict specs, str word_separator, int tokenizer_option=*)
+
+ @cython.locals(
+ new_trie=cython.dict
+ )
+ cpdef dict next_trie(self, dict specs, bint compressed, int tokenizer_option, str word_separator)
diff --git a/pilsner/model.py b/pilsner/model.py
index c626dc7..4ae5f5a 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -9,28 +9,29 @@
class Model(dict):
- CONTENT_KEY = '~content'
- SPECS_KEY = '~specs'
- COMPRESSED_KEY = '~compressed'
- TOKENIZER_OPTION_KEY = '~tokenizer_option'
- WORD_SEPARATOR_KEY = '~word_separator'
- ENTITY_KEY = '~i'
- ATTRS_KEY = '~p'
- INTERNAL_ID_KEY = '~iid'
- DICTIONARY_KEY = '~dictionary'
- KEYWORDS_KEY = '~keywords'
- NORMALIZER_KEY = '~normalization'
- DEFAULT_NORMALIZER_KEY = '~default_normalizer'
- DATASOURCE_KEY = '~datasource'
-
- DEFAULT_DATASOURCE_PATH = '.'
- DEFAULT_DATASOURCE_FILENAME = ''
- DEFAULT_DATASOURCE = ''
-
- DEFAULT_WORD_SEPARATOR = ' '
- DEFAULT_TOKENIZER_OPTION = 0
-
def __init__(self, filename='', storage_location='', debug_mode=False, verbose_mode=False):
+
+ self.CONTENT_KEY = '~content'
+ self.SPECS_KEY = '~specs'
+ self.COMPRESSED_KEY = '~compressed'
+ self.TOKENIZER_OPTION_KEY = '~tokenizer_option'
+ self.WORD_SEPARATOR_KEY = '~word_separator'
+ self.ENTITY_KEY = '~i'
+ self.ATTRS_KEY = '~p'
+ self.INTERNAL_ID_KEY = '~iid'
+ self.DICTIONARY_KEY = '~dictionary'
+ self.KEYWORDS_KEY = '~keywords'
+ self.NORMALIZER_KEY = '~normalization'
+ self.DEFAULT_NORMALIZER_KEY = '~default_normalizer'
+ self.DATASOURCE_KEY = '~datasource'
+
+ self.DEFAULT_DATASOURCE_PATH = '.'
+ self.DEFAULT_DATASOURCE_FILENAME = ''
+ self.DEFAULT_DATASOURCE = ''
+
+ self.DEFAULT_WORD_SEPARATOR = ' '
+ self.DEFAULT_TOKENIZER_OPTION = 0
+
self.DEFAULT_DATASOURCE_FILENAME = storage_location
if self.DEFAULT_DATASOURCE_FILENAME.lower() != ':memory:':
while self.DEFAULT_DATASOURCE_FILENAME == '' or os.path.exists(self.DEFAULT_DATASOURCE):
@@ -129,6 +130,7 @@ def create_recognizer_schema(self, cursor):
logging.debug('Creating schema for permanent storage')
cursor.execute('create table attrs (n integer, iid integer, attr_name text, attr_value text);')
logging.debug('Created schema for permanent storage')
+ return True
def pack_subtrie(self, trie, compressed, prefix):
if not compressed:
@@ -138,7 +140,7 @@ def pack_subtrie(self, trie, compressed, prefix):
if prefix == self.ENTITY_KEY:
return trie, prefix
children = trie
- child_count = len(children)
+ child_count = int(len(children))
if child_count == 1:
for key, child in children.items():
if key == self.ENTITY_KEY:
From d65fce0e9e0167e8fb8fb88fa8c338a10cf3613c Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 15 Sep 2020 23:52:04 -0400
Subject: [PATCH 059/116] implemented insert_node, remove_node methods
---
pilsner/utility.py | 38 +++++++++++++++++++++++++++++++++-----
test/ut_utility.py | 23 +++++++++++++++++++++++
2 files changed, 56 insertions(+), 5 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 3858cc0..6fb150b 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -45,6 +45,38 @@ def compile_dict_specs(self, fields):
logging.debug('Done compiling specs')
return specs
+ def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model):
+ for character in label:
+ if character not in subtrie:
+ subtrie[character] = {}
+ subtrie = subtrie[character]
+ model.store_attributes(label_id, entity_id, subtrie, specs, columns)
+
+ def remove_node(self, model, label, subtrie, prev=0):
+ # checks that we haven't hit the end of the word
+ if label:
+ first, rest = label[0], label[1:]
+ current_length = len(subtrie)
+ next_length, boo = self.remove_node(model, rest, subtrie=subtrie[first], prev=current_length)
+
+ # this statement avoids trimming excessively if the input is a prefix because
+ # if the word is a prefix, the first returned value will be greater than 1
+ if boo and next_length > 1:
+ boo = False
+
+ # this statement checks for the first occurrence of the current dict having more than one child
+ # or it checks that we've hit the bottom without trimming anything
+ elif boo and (current_length > 1 or not prev):
+ del subtrie[first]
+ boo = False
+
+ return current_length, boo
+
+ # when we do hit the end of the word, delete _end
+ else:
+ del subtrie[model.ENTITY_KEY]
+ return len(subtrie) + 1, True
+
def make_recognizer(self, model, filename, specs, word_separator, item_limit, compressed, column_separator, column_enclosure, tokenizer_option):
# TODO: review for refactoring
self.logger('Making recognizer using %s' % (filename))
@@ -80,11 +112,7 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_number, line, column_separator, column_enclosure)
synonym, normalizer_name = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
subtrie = trie[model.CONTENT_KEY][normalizer_name]
- for character in synonym:
- if character not in subtrie:
- subtrie[character] = {}
- subtrie = subtrie[character]
- model.store_attributes(line_number, internal_id, subtrie, specs, columns)
+ self.insert_node(synonym, line_number, internal_id, subtrie, specs, columns, model)
line_count += 1
line_number += 1
if line_count > 0 and len(trie) > 3:
diff --git a/test/ut_utility.py b/test/ut_utility.py
index e4b1455..45feb6e 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -69,6 +69,29 @@ def test_compile_dict_specs(self):
}
assert specs == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(specs))
+ def test_insert_node(self):
+ fields = [
+ {'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
+ {'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
+ {'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
+ {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
+ ]
+ specs = self.recognizer.compile_dict_specs(fields)
+ model = pilsner.Model()
+ model.create_recognizer_schema(model.cursor)
+ test_trie = {}
+ self.recognizer.insert_node(label='the synonym', label_id=1, entity_id='10', subtrie=test_trie, specs=specs, columns=['', '', '', ''], model=model)
+ self.recognizer.insert_node('the synthesis', 2, '20', test_trie, specs, ['', '', '', ''], model)
+ expected = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'o': {'n': {'y': {'m': {'~i': [1]}}}}, 't': {'h': {'e': {'s': {'i': {'s': {'~i': [2]}}}}}}}}}}}}}}
+ assert test_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(test_trie))
+
+ def test_remove_node(self):
+ test_trie = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'o': {'n': {'y': {'m': {'~i': [1]}}}}, 't': {'h': {'e': {'s': {'i': {'s': {'~i': [1]}}}}}}}}}}}}}}
+ model = pilsner.Model()
+ self.recognizer.remove_node(model=model, label='the synonym', subtrie=test_trie)
+ expected = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'t': {'h': {'e': {'s': {'i': {'s': {'~i': [1]}}}}}}}}}}}}}}
+ assert test_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(test_trie))
+
def test_make_recognizer(self):
fields = [
{'name': 'normalizer', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
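insert_node() factors out the character-per-level trie walk that make_recognizer() used to inline; test_insert_node above shows the resulting shape. The same structure reproduced with plain dicts, leaving out the attribute bookkeeping that model.store_attributes() performs at the leaf:

    ENTITY_KEY = '~i'  # the sentinel the model uses for entity leaves

    def insert(trie, label, internal_id):
        node = trie
        for ch in label:
            node = node.setdefault(ch, {})        # one dict level per character
        node.setdefault(ENTITY_KEY, []).append(internal_id)

    trie = {}
    insert(trie, 'it', 1)
    insert(trie, 'in', 2)
    assert trie == {'i': {'t': {'~i': [1]}, 'n': {'~i': [2]}}}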
From b58f83123224684e1e2f1ef3daad8c3cbd47d3f2 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 16 Sep 2020 00:08:31 -0400
Subject: [PATCH 060/116] remove_node() adjustments
---
pilsner/utility.py | 28 +++++++++-------------------
1 file changed, 9 insertions(+), 19 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 6fb150b..56ef6b2 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -52,27 +52,17 @@ def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model
subtrie = subtrie[character]
model.store_attributes(label_id, entity_id, subtrie, specs, columns)
- def remove_node(self, model, label, subtrie, prev=0):
- # checks that we haven't hit the end of the word
+ def remove_node(self, model, label, subtrie, prev_length=0):
if label:
- first, rest = label[0], label[1:]
+ head, tail = label[0], label[1:]
current_length = len(subtrie)
- next_length, boo = self.remove_node(model, rest, subtrie=subtrie[first], prev=current_length)
-
- # this statement avoids trimming excessively if the input is a prefix because
- # if the word is a prefix, the first returned value will be greater than 1
- if boo and next_length > 1:
- boo = False
-
- # this statement checks for the first occurrence of the current dict having more than one child
- # or it checks that we've hit the bottom without trimming anything
- elif boo and (current_length > 1 or not prev):
- del subtrie[first]
- boo = False
-
- return current_length, boo
-
- # when we do hit the end of the word, delete _end
+ next_length, bottom = self.remove_node(model, tail, subtrie=subtrie[head], prev_length=current_length)
+ if bottom and next_length > 1:
+ bottom = False
+ elif bottom and (current_length > 1 or not prev_length):
+ del subtrie[head]
+ bottom = False
+ return current_length, bottom
else:
del subtrie[model.ENTITY_KEY]
return len(subtrie) + 1, True
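The cleaned-up remove_node() is easiest to follow on a concrete trie. A standalone copy of its logic (same control flow, minus the model argument) run against a two-word trie:

    ENTITY_KEY = '~i'

    def remove(label, subtrie, prev_length=0):
        if label:
            head, tail = label[0], label[1:]
            current_length = len(subtrie)
            next_length, bottom = remove(tail, subtrie[head], current_length)
            if bottom and next_length > 1:
                # deleted word was a prefix of a surviving word: stop trimming
                bottom = False
            elif bottom and (current_length > 1 or not prev_length):
                # first fork above the deleted leaf: cut the dead branch here
                del subtrie[head]
                bottom = False
            return current_length, bottom
        # bottom of the word: drop the entity leaf and signal the unwind
        del subtrie[ENTITY_KEY]
        return len(subtrie) + 1, True

    trie = {'i': {'t': {'~i': [1]}, 'n': {'~i': [2]}}}
    remove('it', trie)
    assert trie == {'i': {'n': {'~i': [2]}}}  # 'in' survives, 'it' is gone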
From 3b43a3bcb1af4011a20d98e9c6840718066122d6 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 16 Sep 2020 22:50:16 -0400
Subject: [PATCH 061/116] notes for insert_node(), remove_node()
---
pilsner/utility.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 56ef6b2..eb2bdbc 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -46,6 +46,7 @@ def compile_dict_specs(self, fields):
return specs
def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model):
+ # NB: only works with uncompressed trie
for character in label:
if character not in subtrie:
subtrie[character] = {}
@@ -53,6 +54,7 @@ def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model
model.store_attributes(label_id, entity_id, subtrie, specs, columns)
def remove_node(self, model, label, subtrie, prev_length=0):
+ # NB: only works with uncompressed trie
if label:
head, tail = label[0], label[1:]
current_length = len(subtrie)
From b8f0cbc29622ffe89e8d9e74be310e522bd649f8 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 18 Sep 2020 21:32:48 -0400
Subject: [PATCH 062/116] updated gitignore
---
.gitignore | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index ab5d8ba..cb8f20e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
build/*
cythonized/*
dist/*
+bin/*
*.spec
!**/.gitkeep
-!**/.gitignore
\ No newline at end of file
+!**/.gitignore
From 65180c1032919856e838b3a3f02771d1b1820820 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 18 Sep 2020 21:32:58 -0400
Subject: [PATCH 063/116] building scripts
---
scripts/linux/buildso.sh | 15 +++++++++++++++
scripts/win/makepyd.bat | 2 +-
2 files changed, 16 insertions(+), 1 deletion(-)
create mode 100755 scripts/linux/buildso.sh
diff --git a/scripts/linux/buildso.sh b/scripts/linux/buildso.sh
new file mode 100755
index 0000000..d6d3bd9
--- /dev/null
+++ b/scripts/linux/buildso.sh
@@ -0,0 +1,15 @@
+RUNDIR=`pwd`
+cd `dirname $0`
+MYDIR=`pwd`
+ROOT=${MYDIR}/../..
+ENV=.env.36
+SRC=${ROOT}/pilsner
+DIST=${ROOT}/bin
+TEST=${ROOT}/test
+cd ${ROOT}
+rm -r ${ROOT}/build
+rm -r ${ROOT}/cythonized
+mkdir -p ${DIST}
+${ROOT}/${ENV}/bin/python3 ${TEST}/compile.py build_ext --inplace
+mv ${SRC}/*.so ${DIST}
+cd ${RUNDIR}
diff --git a/scripts/win/makepyd.bat b/scripts/win/makepyd.bat
index 375365f..32fa6fe 100644
--- a/scripts/win/makepyd.bat
+++ b/scripts/win/makepyd.bat
@@ -3,7 +3,7 @@ set RUNDIR=%cd%
set ROOT=%~dp0..\..
set ENV=.env.37
set SRC=%ROOT%\pilsner
-set DIST=%ROOT%\pyd
+set DIST=%ROOT%\bin
set TEST=%ROOT%\test
cd %ROOT%
rmdir /S /Q %ROOT%\build
From fa5f6b052254f41de962b20665afd145cfbacb34 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 00:55:34 -0400
Subject: [PATCH 064/116] pxd headers
---
pilsner/model.pxd | 71 +++++++--
pilsner/utility.pxd | 346 ++++++++++++++++++++++++++++++++++++++++++++
pilsner/utility.py | 14 +-
3 files changed, 414 insertions(+), 17 deletions(-)
create mode 100644 pilsner/utility.pxd
diff --git a/pilsner/model.pxd b/pilsner/model.pxd
index 8f2e45c..8a6f686 100644
--- a/pilsner/model.pxd
+++ b/pilsner/model.pxd
@@ -29,7 +29,10 @@ cdef class Model(dict):
normalizers=cython.dict,
dictionary_number=cython.int
)
- cpdef bint save(self, str filename)
+ cpdef bint save(
+ self,
+ str filename
+ )
@cython.locals(
normalizers=cython.dict,
@@ -37,11 +40,22 @@ cdef class Model(dict):
dictionary=cython.dict,
keywords=cython.dict
)
- cpdef bint load(self, str filename)
+ cpdef bint load(
+ self,
+ str filename
+ )
- cpdef bint add_normalizer(self, str normalizer_name, str filename, bint default=*)
+ cpdef bint add_normalizer(
+ self,
+ str normalizer_name,
+ str filename,
+ bint default=*
+ )
- cpdef bint create_recognizer_schema(self, cursor)
+ cpdef bint create_recognizer_schema(
+ self,
+ cursor
+ )
@cython.locals(
children=cython.dict,
@@ -53,34 +67,71 @@ cdef class Model(dict):
comp_key=cython.str,
comp_children=cython.dict
)
- cpdef tuple pack_subtrie(self, dict trie, bint compressed, str prefix)
+ cpdef tuple pack_subtrie(
+ self,
+ dict trie,
+ bint compressed,
+ str prefix
+ )
@cython.locals(
ret=cython.dict,
normalizer_name=cython.str,
packed=cython.dict
)
- cpdef dict pack_trie(self, dict trie, bint compressed)
+ cpdef dict pack_trie(
+ self,
+ dict trie,
+ bint compressed
+ )
@cython.locals(
k=cython.str
)
- cpdef store_attributes(self, int line_number, int internal_id, dict subtrie, dict specs, list columns)
+ cpdef store_attributes(
+ self,
+ int line_number,
+ int internal_id,
+ dict subtrie,
+ dict specs,
+ list columns
+ )
@cython.locals(
columns=cython.list,
internal_id=cython.int,
entity_id=cython.str
)
- cpdef tuple get_dictionary_line(self, dict specs, dict entity_ids, dict line_numbers, int line_number, str line, str column_separator, str column_enclosure)
+ cpdef tuple get_dictionary_line(
+ self,
+ dict specs,
+ dict entity_ids,
+ dict line_numbers,
+ int line_number,
+ str line,
+ str column_separator,
+ str column_enclosure
+ )
@cython.locals(
synonym=cython.str,
normalizer_name=cython.str
)
- cpdef tuple get_dictionary_synonym(self, list columns, dict specs, str word_separator, int tokenizer_option=*)
+ cpdef tuple get_dictionary_synonym(
+ self,
+ list columns,
+ dict specs,
+ str word_separator,
+ int tokenizer_option=*
+ )
@cython.locals(
new_trie=cython.dict
)
- cpdef dict next_trie(self, dict specs, bint compressed, int tokenizer_option, str word_separator)
+ cpdef dict next_trie(
+ self,
+ dict specs,
+ bint compressed,
+ int tokenizer_option,
+ str word_separator
+ )
diff --git a/pilsner/utility.pxd b/pilsner/utility.pxd
new file mode 100644
index 0000000..4d37194
--- /dev/null
+++ b/pilsner/utility.pxd
@@ -0,0 +1,346 @@
+import cython
+
+cdef class Recognizer():
+
+ cdef public bint debug
+ cdef public bint verbose
+ cdef public logger
+ cdef public callback_status
+ cdef public callback_progress
+
+ cpdef push_message(
+ self,
+ str message,
+ callback_function
+ )
+
+ @cython.locals(
+ specs=cython.dict,
+ i=cython.int,
+ field=cython.dict
+ )
+ cpdef dict compile_dict_specs(
+ self,
+ list fields
+ )
+
+ @cython.locals(
+ character=cython.str
+ )
+ cpdef insert_node(
+ self,
+ str label,
+ int label_id,
+ str entity_id,
+ dict subtrie,
+ dict specs,
+ list columns,
+ model
+ )
+
+ @cython.locals(
+ head=cython.str,
+ tail=cython.str,
+ current_length=cython.int,
+ next_length=cython.int,
+ bottom=cython.bint
+ )
+ cpdef tuple remove_node(
+ self,
+ model,
+ str label,
+ dict subtrie,
+ int prev_length=*
+ )
+
+ @cython.locals(
+ entity_ids=cython.dict,
+ line_numbers=cython.dict,
+ total_bytes=cython.int,
+ increment_bytes=cython.int,
+ this_progress_position=cython.int,
+ last_progress_position=cython.int,
+ ret=cython.list,
+ line_count=cython.int,
+ line_number=cython.int,
+ chars_read=cython.int,
+ trie=cython.dict,
+ line=cython.str,
+ packed=cython.dict,
+ columns=cython.list,
+ internal_id=cython.int,
+ synonym=cython.str,
+ subtrie=cython.dict
+ )
+ cpdef tuple make_recognizer(
+ self,
+ model,
+ str filename,
+ dict specs,
+ str word_separator,
+ int item_limit,
+ bint compressed,
+ str column_separator,
+ str column_enclosure,
+ int tokenizer_option
+ )
+
+ @cython.locals(
+ total_bytes=cython.int,
+ increment_bytes=cython.int,
+ this_progress_position=cython.int,
+ last_progress_position=cython.int,
+ entity_ids=cython.dict,
+ internal_id_map=cython.dict,
+ synonyms=cython.dict,
+ line_count=cython.int,
+ chars_read=cython.int,
+ line=cython.str,
+ columns=cython.list,
+ internal_id=cython.int,
+ synonym=cython.str,
+ overlapping_ids=cython.dict,
+ s=cython.str,
+ tokens=cython.list,
+ keywords=cython.dict
+ )
+ cpdef dict make_keywords(
+ self,
+ model,
+ str filename,
+ dict specs,
+ dict line_numbers,
+ str word_separator,
+ bint disambiguate_all,
+ str column_separator,
+ str column_enclosure,
+ int tokenizer_option
+ )
+
+ @cython.locals(
+ tries=cython.list,
+ line_numbers=cython.dict,
+ keywords=cython.dict
+ )
+ cpdef bint compile_model(
+ self,
+ model,
+ str filename,
+ dict specs,
+ str word_separator,
+ str column_separator,
+ str column_enclosure,
+ bint compressed=*,
+ int item_limit=*,
+ int tokenizer_option=*,
+ bint include_keywords=*,
+ bint disambiguate_all=*
+ )
+
+ @cython.locals(
+ branches=cython.list,
+ radix=cython.str,
+ unpacked_trie=cython.dict,
+ character=cython.str,
+ unpacked_trie_pointer=cython.dict
+ )
+ cpdef dict unpack_trie(
+ self,
+ model,
+ dict packed_trie,
+ bint compressed
+ )
+
+ @cython.locals(
+ attributes=cython.dict,
+ include_attrs=cython.set,
+ exclude_attrs=cython.set,
+ n=cython.int,
+ ns=cython.set,
+ attr_name=cython.str,
+ attr_value=cython.str
+ )
+ cpdef dict unpack_attributes(
+ self,
+ cur,
+ list lead_ids,
+ str include_query,
+ str exclude_query,
+ bint process_exclude,
+ str attrs_out_query
+ )
+
+ cpdef dict check_attrs(
+ self,
+ model,
+ dict trie_leaf,
+ cur,
+ str include_query,
+ str exclude_query,
+ bint process_exclude,
+ str attrs_out_query
+ )
+
+ @cython.locals(
+ rets=cython.list,
+ this_progress_position=cython.int,
+ last_progress_position=cython.int,
+ total_tries=cython.int,
+ progress_share=cython.int,
+ trie_increment=cython.int,
+ current_trie_index=cython.int,
+ trie=cython.dict,
+ ret=cython.list,
+ word_separator=cython.str,
+ start_index=cython.int,
+ end_index=cython.int,
+ string_so_far=cython.str,
+ reading_entity=cython.bint,
+ trie_is_compressed=cython.bint,
+ subtrie=cython.dict,
+ shorter_alternative=cython.tuple,
+ current_index=cython.int,
+ temporary_index=cython.int,
+ total_length=cython.int,
+ increment_chars=cython.int,
+ character=cython.str,
+ found_object=cython.dict,
+ identified=cython.tuple
+ )
+ cpdef list spot_entities(
+ self,
+ model,
+ str source_string,
+ str normalizer_name,
+ str include_query=*,
+ str exclude_query=*,
+ bint process_exclude=*,
+ str attrs_out_query=*,
+ int progress_from=*,
+ int progress_to=*
+ )
+
+ @cython.locals(
+ _recognized=cython.list,
+ id_list=cython.list,
+ k=cython.int,
+ ids=cython.list,
+ si=cython.dict,
+ src=cython.dict,
+ ei=cython.dict,
+ tokens=cython.dict,
+ s_tokens=cython.dict,
+ j=cython.int,
+ tmp=cython.dict,
+ kwd=cython.dict,
+ winner_score=cython.int,
+ winner_id=cython.set,
+ kwd_score=cython.dict,
+ i=cython.int
+ )
+ cpdef list disambiguate(
+ self,
+ model,
+ list recognized,
+ list srcs,
+ str word_separator
+ )
+
+ @cython.locals(
+ spans=cython.dict,
+ srcs=cython.list,
+ i=cython.int,
+ layer=cython.tuple,
+ _map=cython.dict,
+ _r_map=cython.dict,
+ _recognized=cython.list,
+ _src=cython.str,
+ span=cython.tuple,
+ location=cython.tuple,
+ new_layers=cython.list,
+ new_left=cython.int,
+ new_right=cython.int,
+ new_ids=cython.list,
+ new_attrs=cython.dict,
+ new_srcids=cython.list,
+ new_locations=cython.list,
+ new_map=cython.dict,
+ new_r_map=cython.dict,
+ item=cython.tuple,
+ ret=cython.list
+ )
+ cpdef list flatten_layers(
+ self,
+ model,
+ list layers
+ )
+
+ @cython.locals(
+ ret=cython.dict,
+ all_entries=cython.list,
+ span=cython.tuple,
+ _ids=cython.list,
+ _content=cython.dict,
+ _left=cython.int,
+ _right=cython.int,
+ _id=cython.int,
+ _attrs=cython.dict,
+ _attr_name=cython.str,
+ _attr_value=cython.str,
+ filtered_entries=cython.list,
+ i=cython.int,
+ q=cython.tuple,
+ entry=cython.tuple,
+ _location=cython.tuple
+ )
+ cpdef dict flatten_spans(
+ self,
+ list spans
+ )
+
+ @cython.locals(
+ sorted_segments=cython.list,
+ i=cython.int,
+ j=cython.int,
+ recovered=cython.bint,
+ ret=cython.list
+ )
+ cpdef list reduce_spans(
+ self,
+ set segments
+ )
+
+ @cython.locals(
+ attributes=cython.dict,
+ action=cython.str,
+ process_exclude=cython.bint,
+ include_set=cython.set,
+ include_query=cython.str,
+ attr_name=cython.str,
+ attr_value=cython.str,
+ exclude_set=cython.set,
+ exclude_query=cython.str,
+ attrs_out_query=cython.str,
+ rets=cython.list,
+ total_normalizers=cython.int,
+ spot_progress_share=cython.int,
+ current_normalizer_index=cython.int,
+ normalizer_name=cython.str,
+ normalized_string=cython.str,
+ character_map=cython.list,
+ r_character_map=cython.list,
+ progress_from=cython.int,
+ progress_to=cython.int,
+ parsed=cython.list,
+ layers=cython.list,
+ spans=cython.dict,
+ locations=cython.list,
+ ret=cython.dict
+ )
+ cpdef dict parse(
+ self,
+ model,
+ str source_string,
+ dict attrs_where=*,
+ list attrs_out=*
+ )
diff --git a/pilsner/utility.py b/pilsner/utility.py
index eb2bdbc..0e8aebc 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -137,7 +137,7 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
self.push_message(int(100 * chars_read / total_bytes), self.callback_progress)
columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, column_enclosure)
internal_id_map[line_count] = internal_id
- synonym, _ = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
+ synonym = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)[0]
if synonym not in synonyms:
synonyms[synonym] = set()
synonyms[synonym].add(internal_id)
@@ -154,7 +154,7 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
for line in f:
columns, internal_id = model.get_dictionary_line(specs, entity_ids, line_numbers, line_count, line, column_separator, column_enclosure)
if internal_id in overlapping_ids:
- synonym, _ = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)
+ synonym = model.get_dictionary_synonym(columns, specs, word_separator, tokenizer_option)[0]
tokens = synonym.split(word_separator)
overlapping_ids[internal_id] = overlapping_ids[internal_id].union(set(tokens))
line_count += 1
@@ -193,18 +193,18 @@ def unpack_trie(self, model, packed_trie, compressed):
def unpack_attributes(self, cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query):
attributes = {}
- include = set()
- exclude = set()
+ include_attrs = set()
+ exclude_attrs = set()
for n in leaf_ids:
rows = cur.execute('select distinct n from attrs where n = %d %s;' % (n, include_query))
for row in rows:
- include.add(int(row[0]))
+ include_attrs.add(int(row[0]))
if process_exclude:
for n in leaf_ids:
rows = cur.execute('select distinct n from attrs where n = %d %s;' % (n, exclude_query))
for row in rows:
- exclude.add(int(row[0]))
- ns = include - exclude
+ exclude_attrs.add(int(row[0]))
+ ns = include_attrs - exclude_attrs
for n in ns:
rows = cur.execute('select attr_name, attr_value from attrs where n = %d%s;' % (n, attrs_out_query))
if n not in attributes:
From 670b20034e50d6b5653470fc79d828df4a6205da Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 00:56:54 -0400
Subject: [PATCH 065/116] building and testing scripts
---
scripts/linux/buildso.sh | 3 +++
scripts/linux/testing.sh | 1 +
scripts/win/makepyd.bat | 3 +++
scripts/win/testing.bat | 1 +
test/compile.py | 3 ++-
5 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/scripts/linux/buildso.sh b/scripts/linux/buildso.sh
index d6d3bd9..6e1342d 100755
--- a/scripts/linux/buildso.sh
+++ b/scripts/linux/buildso.sh
@@ -9,7 +9,10 @@ TEST=${ROOT}/test
cd ${ROOT}
rm -r ${ROOT}/build
rm -r ${ROOT}/cythonized
+rm -r ${DIST}
mkdir -p ${DIST}
${ROOT}/${ENV}/bin/python3 ${TEST}/compile.py build_ext --inplace
mv ${SRC}/*.so ${DIST}
+cp ${SRC}/__init__.py ${DIST}
+cp ${SRC}/*.xml ${DIST}
cd ${RUNDIR}
diff --git a/scripts/linux/testing.sh b/scripts/linux/testing.sh
index ec23305..9c825ce 100755
--- a/scripts/linux/testing.sh
+++ b/scripts/linux/testing.sh
@@ -1,6 +1,7 @@
RUNDIR=`pwd`
cd `dirname $0`
MYDIR=`pwd`
+${MYDIR}/buildso.sh
ROOT=${MYDIR}/../..
ENV=.env.36
TEST=${ROOT}/test
diff --git a/scripts/win/makepyd.bat b/scripts/win/makepyd.bat
index 32fa6fe..40c8a31 100644
--- a/scripts/win/makepyd.bat
+++ b/scripts/win/makepyd.bat
@@ -8,7 +8,10 @@ set TEST=%ROOT%\test
cd %ROOT%
rmdir /S /Q %ROOT%\build
rmdir /S /Q %ROOT%\cythonized
+rmdir /S /Q %DIST%
if not exist %DIST%\nul mkdir %DIST%
call %ROOT%\%ENV%\Scripts\python %TEST%\compile.py build_ext --inplace
move /Y %SRC%\*.pyd %DIST%\
+copy /Y %SRC%\__init__.py %DIST%\
+copy /Y %SRC%\*.xml %DIST%\
cd %RUNDIR%
diff --git a/scripts/win/testing.bat b/scripts/win/testing.bat
index ae32a49..6b09633 100644
--- a/scripts/win/testing.bat
+++ b/scripts/win/testing.bat
@@ -1,4 +1,5 @@
@echo off
+call %~dp0\buildpyd.bat
set RUNDIR=%cd%
set ROOT=%~dp0..\..
set ENV=.env.37
diff --git a/test/compile.py b/test/compile.py
index 874dc19..6eace6c 100644
--- a/test/compile.py
+++ b/test/compile.py
@@ -2,7 +2,8 @@
from Cython.Build import cythonize
src = [
- 'pilsner/model.py'
+ 'pilsner/model.py',
+ 'pilsner/utility.py'
]
setup(ext_modules=cythonize(src, compiler_directives={'language_level': '3'}, build_dir='cythonized'))
From a1b717c8184eafacecdb53996fa53f1674bb015b Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 00:57:13 -0400
Subject: [PATCH 066/116] testing for source code and compiled libs
---
test/performance.py | 4 ++--
test/sandbox.py | 2 +-
test/ut_model.py | 12 +++++++++---
test/ut_utility.py | 12 +++++++++---
4 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/test/performance.py b/test/performance.py
index 4ed8368..edf8059 100644
--- a/test/performance.py
+++ b/test/performance.py
@@ -237,6 +237,6 @@ def cleanup():
if __name__ == '__main__':
create_test_dataset()
- perf_compile_model_save_model(['pilsner'])
- perf_load_model_parse_test(['pilsner'])
+ perf_compile_model_save_model(['pilsner', 'bin'])
+ perf_load_model_parse_test(['pilsner', 'bin'])
cleanup()
diff --git a/test/sandbox.py b/test/sandbox.py
index 4d4a633..534d6f3 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -43,7 +43,7 @@ def load_it():
s = 'this is awesome white refrigerators o refrigerator, is it not'
s *= 10
_messages.clear()
- #q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
+ q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
#print(q)
save_it()
diff --git a/test/ut_model.py b/test/ut_model.py
index 7a4fcaf..a7b9c19 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -1,7 +1,6 @@
import os
-import sys; sys.path.insert(0, '')
+import sys
import unittest
-import pilsner # pylint: disable=E0611,F0401
class TestModel(unittest.TestCase):
@@ -168,4 +167,11 @@ def test_next_trie(self):
assert got_trie == expected, 'Expected %s, got %s' % (str(expected), str(got_trie))
if __name__ == '__main__':
- unittest.main()
+ sys.path.insert(0, '')
+ import pilsner # pylint: disable=E0611,F0401
+ unittest.main(exit=False)
+ try:
+ import bin as pilsner # pylint: disable=E0611,F0401
+ unittest.main()
+ except ModuleNotFoundError:
+ print('Could not import module from /bin, test skipped.')
diff --git a/test/ut_utility.py b/test/ut_utility.py
index 45feb6e..ae3b028 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -1,6 +1,5 @@
-import sys; sys.path.insert(0, '')
+import sys
import unittest
-import pilsner # pylint: disable=E0611,F0401
class TestUtility(unittest.TestCase):
@@ -391,4 +390,11 @@ def test_parse(self):
assert output == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(output))
if __name__ == '__main__':
- unittest.main()
+ sys.path.insert(0, '')
+ import pilsner # pylint: disable=E0611,F0401
+ unittest.main(exit=False)
+ try:
+ import bin as pilsner # pylint: disable=E0611,F0401
+ unittest.main()
+ except ModuleNotFoundError:
+ print('Could not import module from /bin, test skipped.')
From c151f3fb748317b02adaa6346a62f93221a20bf9 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 01:04:50 -0400
Subject: [PATCH 067/116] renamed makepyd -> buildpyd
---
scripts/win/{makepyd.bat => buildpyd.bat} | 0
1 file changed, 0 insertions(+), 0 deletions(-)
rename scripts/win/{makepyd.bat => buildpyd.bat} (100%)
diff --git a/scripts/win/makepyd.bat b/scripts/win/buildpyd.bat
similarity index 100%
rename from scripts/win/makepyd.bat
rename to scripts/win/buildpyd.bat
From 39d43831b146a9842c1ceb2079e18418d9127c3e Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 19:34:35 -0400
Subject: [PATCH 068/116] types
---
pilsner/model.pxd | 1 -
pilsner/model.py | 8 ++++----
test/ut_model.py | 6 +++++-
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/pilsner/model.pxd b/pilsner/model.pxd
index 8a6f686..08dcf8c 100644
--- a/pilsner/model.pxd
+++ b/pilsner/model.pxd
@@ -61,7 +61,6 @@ cdef class Model(dict):
children=cython.dict,
child_count=cython.int,
key=cython.str,
- child=cython.dict,
next_prefix=cython.str,
comp_child=cython.dict,
comp_key=cython.str,
diff --git a/pilsner/model.py b/pilsner/model.py
index 4ae5f5a..7c0cfa6 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -135,20 +135,20 @@ def create_recognizer_schema(self, cursor):
def pack_subtrie(self, trie, compressed, prefix):
if not compressed:
return trie, prefix
- if type(trie) != dict:
- return trie, prefix
+ # if type(trie) != dict:
+ # return trie, prefix
if prefix == self.ENTITY_KEY:
return trie, prefix
children = trie
child_count = int(len(children))
if child_count == 1:
- for key, child in children.items():
+ for key in children:
if key == self.ENTITY_KEY:
if len(prefix) > 1:
return {prefix[1:]: trie}, prefix[0]
return trie, prefix
next_prefix = prefix + key
- comp_child, comp_key = self.pack_subtrie(child, compressed, next_prefix)
+ comp_child, comp_key = self.pack_subtrie(children[key], compressed, next_prefix)
if prefix == '':
comp_children = {comp_key: comp_child}
else:
diff --git a/test/ut_model.py b/test/ut_model.py
index a7b9c19..c096620 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -172,6 +172,10 @@ def test_next_trie(self):
unittest.main(exit=False)
try:
import bin as pilsner # pylint: disable=E0611,F0401
- unittest.main()
+ #unittest.main()
+ ut = TestModel()
+ ut.setUp()
+ ut.test_del()
+ ut.tearDown()
except ModuleNotFoundError:
print('Could not import module from /bin, test skipped.')
From ffe44e6284e7aa6b5d7ec0749a46e345d6739390 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 21:20:13 -0400
Subject: [PATCH 069/116] model destroyer
---
pilsner/model.py | 8 +++++++-
test/ut_model.py | 10 +++++-----
2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 7c0cfa6..34fea63 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -53,12 +53,18 @@ def __init__(self, filename='', storage_location='', debug_mode=False, verbose_m
if filename != '':
self.load(filename)
- def __del__(self):
+ def destroy(self):
# remove all temporary resources
self.connection.close()
if os.path.exists(self.DEFAULT_DATASOURCE):
os.remove(self.DEFAULT_DATASOURCE)
+ def __del__(self):
+ try:
+ self.destroy()
+ except:
+ pass
+
def save(self, filename):
assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
logging.debug('Saving model "%s"' % (filename))
diff --git a/test/ut_model.py b/test/ut_model.py
index c096620..01b2153 100644
--- a/test/ut_model.py
+++ b/test/ut_model.py
@@ -8,6 +8,7 @@ def setUp(self):
self.model = pilsner.Model()
def tearDown(self):
+ self.model.destroy()
del(self.model)
def test_init(self):
@@ -16,10 +17,12 @@ def test_init(self):
assert type(m) == pilsner.Model, 'Model is expected to have pilsner.Model type, but has %s instead' % (str(type(m)))
storage = m.DEFAULT_DATASOURCE
assert storage.lower() == ':memory:' or os.path.exists(storage), 'Model storage is not where it is supposed to be'
+ m.destroy()
def test_del(self):
m = pilsner.Model()
storage = m.DEFAULT_DATASOURCE
+ m.destroy()
del(m)
assert 'm' not in locals(), 'Instance of Model class has not been destroyed'
assert storage.lower() == ':memory:' or not os.path.exists(storage), 'Model storage is supposed to be removed once class has been destroyed'
@@ -44,6 +47,7 @@ def test_load(self):
another_model = pilsner.Model()
another_model.load('./.test_load')
assert another_model[another_model.DICTIONARY_KEY] == expected, 'Loaded model %s != saved model %s' % (str(another_model[another_model.DICTIONARY_KEY]), str(expected))
+ another_model.destroy()
del(another_model)
os.remove('./.test_load.0.dictionary')
os.remove('./.test_load.1.dictionary')
@@ -172,10 +176,6 @@ def test_next_trie(self):
unittest.main(exit=False)
try:
import bin as pilsner # pylint: disable=E0611,F0401
- #unittest.main()
- ut = TestModel()
- ut.setUp()
- ut.test_del()
- ut.tearDown()
+ unittest.main()
except ModuleNotFoundError:
print('Could not import module from /bin, test skipped.')
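With the destroy()/__del__ split above, cleanup is explicit and the destructor merely retries it defensively. A minimal sketch of the intended lifecycle, assuming pilsner is importable from the working directory:

import pilsner

m = pilsner.Model()   # creates a temporary sqlite database (on disk unless ':memory:')
try:
    pass              # ... populate and query the model here ...
finally:
    m.destroy()       # closes the connection and removes the temporary database
del m                 # __del__ calls destroy() again but swallows any exception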
From 3fddd44b83a21682544dddf104032bef46eff1f0 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 19 Sep 2020 22:17:47 -0400
Subject: [PATCH 070/116] pxd headers (wip)
---
pilsner/model.pxd | 3 +--
pilsner/model.py | 6 ++++--
pilsner/utility.pxd | 8 ++++----
pilsner/utility.py | 2 +-
test/ut_utility.py | 23 +++++++++++++++--------
5 files changed, 25 insertions(+), 17 deletions(-)
diff --git a/pilsner/model.pxd b/pilsner/model.pxd
index 08dcf8c..d3655ec 100644
--- a/pilsner/model.pxd
+++ b/pilsner/model.pxd
@@ -62,13 +62,12 @@ cdef class Model(dict):
child_count=cython.int,
key=cython.str,
next_prefix=cython.str,
- comp_child=cython.dict,
comp_key=cython.str,
comp_children=cython.dict
)
cpdef tuple pack_subtrie(
self,
- dict trie,
+ trie,
bint compressed,
str prefix
)
diff --git a/pilsner/model.py b/pilsner/model.py
index 34fea63..49d8761 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -154,6 +154,8 @@ def pack_subtrie(self, trie, compressed, prefix):
return {prefix[1:]: trie}, prefix[0]
return trie, prefix
next_prefix = prefix + key
+ if not isinstance(children[key], dict):
+ return children[key], next_prefix
comp_child, comp_key = self.pack_subtrie(children[key], compressed, next_prefix)
if prefix == '':
comp_children = {comp_key: comp_child}
@@ -162,8 +164,8 @@ def pack_subtrie(self, trie, compressed, prefix):
return comp_children, comp_key
else:
comp_children = {}
- for key, child in children.items():
- comp_child, comp_key = self.pack_subtrie(child, compressed, key)
+ for key in children:
+ comp_child, comp_key = self.pack_subtrie(children[key], compressed, key)
comp_children[comp_key] = comp_child
if len(prefix) > 1:
comp_children = {prefix[0]: {prefix[1:]: comp_children}}
diff --git a/pilsner/utility.pxd b/pilsner/utility.pxd
index 4d37194..320b3fc 100644
--- a/pilsner/utility.pxd
+++ b/pilsner/utility.pxd
@@ -10,7 +10,7 @@ cdef class Recognizer():
cpdef push_message(
self,
- str message,
+ message,
callback_function
)
@@ -31,7 +31,7 @@ cdef class Recognizer():
self,
str label,
int label_id,
- str entity_id,
+ int entity_id,
dict subtrie,
dict specs,
list columns,
@@ -251,8 +251,8 @@ cdef class Recognizer():
srcs=cython.list,
i=cython.int,
layer=cython.tuple,
- _map=cython.dict,
- _r_map=cython.dict,
+ _map=cython.list,
+ _r_map=cython.list,
_recognized=cython.list,
_src=cython.str,
span=cython.tuple,
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 0e8aebc..cb8fdf7 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -492,7 +492,7 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
current_normalizer_index += 1
layers = self.flatten_layers(model, rets)
spans = self.flatten_spans(layers)
- locations = self.reduce_spans(spans.keys())
+ locations = self.reduce_spans(set(spans.keys()))
ret = {location: spans[location] for location in locations}
self.logger('Done parsing text.')
return ret
diff --git a/test/ut_utility.py b/test/ut_utility.py
index ae3b028..ff63988 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -5,9 +5,12 @@ class TestUtility(unittest.TestCase):
def setUp(self):
self.recognizer = pilsner.Recognizer()
+ self.model = pilsner.Model()
def tearDown(self):
del(self.recognizer)
+ self.model.destroy()
+ del(self.model)
def compile_test_model(self):
fields = [
@@ -17,7 +20,7 @@ def compile_test_model(self):
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = self.recognizer.compile_dict_specs(fields)
- model = pilsner.Model()
+ model = self.model
model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
model.normalizer_map = {
@@ -76,17 +79,17 @@ def test_insert_node(self):
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = self.recognizer.compile_dict_specs(fields)
- model = pilsner.Model()
+ model = self.model
model.create_recognizer_schema(model.cursor)
test_trie = {}
- self.recognizer.insert_node(label='the synonym', label_id=1, entity_id='10', subtrie=test_trie, specs=specs, columns=['', '', '', ''], model=model)
- self.recognizer.insert_node('the synthesis', 2, '20', test_trie, specs, ['', '', '', ''], model)
+ self.recognizer.insert_node(label='the synonym', label_id=1, entity_id=10, subtrie=test_trie, specs=specs, columns=['', '', '', ''], model=model)
+ self.recognizer.insert_node('the synthesis', 2, 20, test_trie, specs, ['', '', '', ''], model)
expected = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'o': {'n': {'y': {'m': {'~i': [1]}}}}, 't': {'h': {'e': {'s': {'i': {'s': {'~i': [2]}}}}}}}}}}}}}}
assert test_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(test_trie))
def test_remove_node(self):
test_trie = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'o': {'n': {'y': {'m': {'~i': [1]}}}}, 't': {'h': {'e': {'s': {'i': {'s': {'~i': [1]}}}}}}}}}}}}}}
- model = pilsner.Model()
+ model = self.model
self.recognizer.remove_node(model=model, label='the synonym', subtrie=test_trie)
expected = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'t': {'h': {'e': {'s': {'i': {'s': {'~i': [1]}}}}}}}}}}}}}}
assert test_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(test_trie))
@@ -99,7 +102,7 @@ def test_make_recognizer(self):
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = self.recognizer.compile_dict_specs(fields)
- model = pilsner.Model()
+ model = self.model
got_recognizer, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected_recognizer = [
{
@@ -144,7 +147,7 @@ def test_make_keywords(self):
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = self.recognizer.compile_dict_specs(fields)
- model = pilsner.Model()
+ model = self.model
_, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected = {
@@ -372,7 +375,7 @@ def test_flatten_spans(self):
assert flattened == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(flattened))
def test_reduce_spans(self):
- segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
+ segments = set([tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])])
expected = [tuple([1, 6])]
reduced = self.recognizer.reduce_spans(segments)
assert reduced == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(reduced))
@@ -396,5 +399,9 @@ def test_parse(self):
try:
import bin as pilsner # pylint: disable=E0611,F0401
unittest.main()
+ # x = TestUtility()
+ # x.setUp()
+ # x.compile_test_model()
+ # x.tearDown()
except ModuleNotFoundError:
print('Could not import module from /bin, test skipped.')
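The updated test passes a set into reduce_spans(), matching the new `cpdef list reduce_spans(self, set segments)` declaration. A usage sketch mirroring that test:

import pilsner

r = pilsner.Recognizer()
segments = {(1, 2), (3, 8), (1, 6), (2, 3)}  # overlapping candidate spans
reduced = r.reduce_spans(segments)           # longer spans win; leftmost wins ties
assert reduced == [(1, 6)]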
From d89bed1e4c7dbd0aa1a567fc12a8198cd6e9aacc Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 20 Sep 2020 00:33:58 -0400
Subject: [PATCH 071/116] destroy model after performance testing
---
test/performance.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/test/performance.py b/test/performance.py
index edf8059..35640e0 100644
--- a/test/performance.py
+++ b/test/performance.py
@@ -151,6 +151,7 @@ def perf_compile_model_save_model(modules_to_test):
""" % (x),
stmt="""
recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+model.destroy()
""",
number=n
)
@@ -177,6 +178,7 @@ def perf_compile_model_save_model(modules_to_test):
""" % (x),
stmt="""
model.save('.test-model')
+model.destroy()
""",
number=n
)
@@ -196,6 +198,7 @@ def perf_load_model_parse_test(modules_to_test):
""" % (x),
stmt="""
model.load('.test-model')
+model.destroy()
""",
number=n
)
@@ -217,6 +220,7 @@ def perf_load_model_parse_test(modules_to_test):
""" % (x),
stmt="""
found = recognizer.parse(model, test_text)
+model.destroy()
""",
number=n
)
From 42fadf71f6ab812dd99b142965b9f9260e609b69 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 20 Sep 2020 00:34:17 -0400
Subject: [PATCH 072/116] explicit type conversion
---
pilsner/utility.py | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index cb8fdf7..c3af781 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -57,7 +57,7 @@ def remove_node(self, model, label, subtrie, prev_length=0):
# NB: only works with uncompressed trie
if label:
head, tail = label[0], label[1:]
- current_length = len(subtrie)
+ current_length = int(len(subtrie))
next_length, bottom = self.remove_node(model, tail, subtrie=subtrie[head], prev_length=current_length)
if bottom and next_length > 1:
bottom = False
@@ -90,7 +90,7 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
chars_read = 0
trie = model.next_trie(specs, compressed, tokenizer_option, word_separator)
for line in f:
- chars_read += len(line)
+ chars_read += int(len(line))
this_progress_position = int(chars_read / increment_bytes)
if this_progress_position != last_progress_position:
last_progress_position = this_progress_position
@@ -130,7 +130,7 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
line_count = 0
chars_read = 0
for line in f:
- chars_read += len(line)
+ chars_read += int(len(line))
this_progress_position = int(chars_read / increment_bytes)
if this_progress_position != last_progress_position:
last_progress_position = this_progress_position
@@ -218,7 +218,7 @@ def unpack_attributes(self, cur, leaf_ids, include_query, exclude_query, process
def check_attrs(self, model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query):
trie_leaf[model.ATTRS_KEY] = self.unpack_attributes(cur, trie_leaf[model.ENTITY_KEY], include_query, exclude_query, process_exclude, attrs_out_query)
- if len(trie_leaf[model.ATTRS_KEY]) == 0:
+ if int(len(trie_leaf[model.ATTRS_KEY])) == 0:
return {}
return trie_leaf
@@ -242,7 +242,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
shorter_alternative = None
current_index = 0
temporary_index = -1
- total_length = len(source_string)
+ total_length = int(len(source_string))
increment_chars = int(total_length / progress_share) if total_length > progress_share else total_length - 1
while current_index < total_length:
this_progress_position = int(current_index / increment_chars / total_tries)
@@ -478,7 +478,7 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
self.logger('Parsing text...')
self.push_message('Parsing text', self.callback_status)
rets = []
- total_normalizers = len(model[model.NORMALIZER_KEY])
+ total_normalizers = int(len(model[model.NORMALIZER_KEY]))
spot_progress_share = int(100 / total_normalizers)
current_normalizer_index = 0
for normalizer_name in model[model.NORMALIZER_KEY]:
From 170cbb200bba8400b8efb10b0db5e7b80f0c662f Mon Sep 17 00:00:00 2001
From: Pavel Golovatenko-Abramov
Date: Mon, 21 Sep 2020 00:49:02 -0400
Subject: [PATCH 073/116] docstrings for model.py (wip)
---
pilsner/model.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 49d8761..7885d2e 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -8,6 +8,7 @@
import shutil
class Model(dict):
+ """This class is a dict that stores tries and metadata, and provides functions and methods associated with the storage."""
def __init__(self, filename='', storage_location='', debug_mode=False, verbose_mode=False):
@@ -54,7 +55,7 @@ def __init__(self, filename='', storage_location='', debug_mode=False, verbose_m
self.load(filename)
def destroy(self):
- # remove all temporary resources
+ """Close connection, remove temporary database"""
self.connection.close()
if os.path.exists(self.DEFAULT_DATASOURCE):
os.remove(self.DEFAULT_DATASOURCE)
From 6ec889873094ebeca32464c580decfbc785a5c45 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 22 Sep 2020 22:54:39 -0400
Subject: [PATCH 074/116] docstrings for model.py
---
pilsner/model.py | 96 +++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 95 insertions(+), 1 deletion(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 7885d2e..82e0e11 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -11,7 +11,12 @@ class Model(dict):
"""This class is a dict that stores tries and metadata, and provides functions and methods associated with the storage."""
def __init__(self, filename='', storage_location='', debug_mode=False, verbose_mode=False):
+ """Creates Model instance.
+ Args:
+ str *filename*: if provided, loads model from disk, see load() method
+ str *storage_location*: location of temporary database; ':memory:' keeps it in memory, empty string makes a randomly named file in the working directory
+ """
self.CONTENT_KEY = '~content'
self.SPECS_KEY = '~specs'
self.COMPRESSED_KEY = '~compressed'
@@ -55,18 +60,31 @@ def __init__(self, filename='', storage_location='', debug_mode=False, verbose_m
self.load(filename)
def destroy(self):
- """Close connection, remove temporary database"""
+ """Closes connection, removes temporary database."""
self.connection.close()
if os.path.exists(self.DEFAULT_DATASOURCE):
os.remove(self.DEFAULT_DATASOURCE)
def __del__(self):
+ """Desctructor"""
try:
self.destroy()
except:
pass
def save(self, filename):
+ """Saves model to disk.
+ Note: this will throw exception if temporary database is stored in memory.
+
+ Args:
+ str *filename*: path and filename prefix for names of files that will be written.
+
+ Example: model.save('filename') will write the following files:
+ filename.normalizers
+ filename.*.dictionary (can be multiple files, depends on model settings)
+ filename.keywords
+ filename.attributes
+ """
assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
logging.debug('Saving model "%s"' % (filename))
self.cursor.close()
@@ -95,6 +113,17 @@ def save(self, filename):
return True
def load(self, filename):
+ """Loads model from disk.
+
+ Args:
+ str *filename*: path and filename prefix for names of files that represent the model on disk.
+
+ Example: model.load('filename') will attempt reading the following files:
+ filename.normalizers
+ filename.*.dictionary
+ filename.keywords
+ filename.attributes
+ """
logging.debug('Loading model "%s"' % (filename))
self[self.DATASOURCE_KEY] = '%s.attributes' % (filename)
self.cursor.close()
@@ -124,6 +153,13 @@ def load(self, filename):
return True
def add_normalizer(self, normalizer_name, filename, default=False):
+ """Adds normalization unit to the model.
+
+ Args:
+ str *normalizer_name*: name of normalization unit
+ str *filename*: path and name of configuration file
+ bool *default*: if True, model will use this normalization unit by default
+ """
logging.debug('Adding normalizer "%s" from "%s"' % (normalizer_name, filename))
normalizer = self.sic_builder.build_normalizer(filename)
self[self.NORMALIZER_KEY][normalizer_name] = normalizer
@@ -134,12 +170,25 @@ def add_normalizer(self, normalizer_name, filename, default=False):
return True
def create_recognizer_schema(self, cursor):
+ """Creates tables in the database that stores attributes of entities.
+
+ Args:
+ sqlite3.connect.cursor *cursor*: cursor to use for executing queries
+ """
logging.debug('Creating schema for permanent storage')
cursor.execute('create table attrs (n integer, iid integer, attr_name text, attr_value text);')
logging.debug('Created schema for permanent storage')
return True
def pack_subtrie(self, trie, compressed, prefix):
+ """Recursively compresses a trie.
+ Returns tuple (dict compressed_trie, str prefix).
+
+ Args:
+ dict *trie*: object representing a trie
+ bool *compressed*: whether a given trie must be compressed
+ str *prefix*: compressed prefix of a branch
+ """
if not compressed:
return trie, prefix
# if type(trie) != dict:
@@ -174,6 +223,13 @@ def pack_subtrie(self, trie, compressed, prefix):
return comp_children, prefix
def pack_trie(self, trie, compressed):
+ """Compresses all tries in a model.
+ Returns dict that contains all compressed tries.
+
+ Args:
+ dict *trie*: part of model that contains tries
+ bool *compressed*: whether tries in a given structure must be compressed
+ """
ret = {k: trie[k] for k in trie if k != self.CONTENT_KEY}
ret[self.CONTENT_KEY] = {}
for normalizer_name in trie[self.CONTENT_KEY]:
@@ -182,6 +238,15 @@ def pack_trie(self, trie, compressed):
return ret
def store_attributes(self, line_number, internal_id, subtrie, specs, columns):
+ """Flags terminus of a trie and writes attributes of an entry to the temporary database.
+
+ Args:
+ int *line_number*: number of the line being read from the source file
+ int *internal_id*: internally assigned ID of an entity
+ dict *subtrie*: subtrie that is being constructed
+ dict *specs*: specs for columns in the file being read
+ list *columns*: values in columns (attributes) in the file being read
+ """
if self.ENTITY_KEY not in subtrie:
subtrie[self.ENTITY_KEY] = []
subtrie[self.ENTITY_KEY].append(line_number)
@@ -194,6 +259,18 @@ def store_attributes(self, line_number, internal_id, subtrie, specs, columns):
_ = [self.cursor.execute('insert into attrs (n, iid, attr_name, attr_value) select ?, ?, ?, ?;', (line_number, internal_id, k, s)) for s in set(columns[specs['fields'][k][0]].split(specs['fields'][k][1]))]
def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line, column_separator, column_enclosure):
+ """Extracts values of columns in a file and associates them with internal entity ID.
+ Returns tuple (list *column_values*, int *internal_id*).
+
+ Args:
+ dict *specs*: specs for columns in the file being read
+ dict *entity_ids*: map between real entity IDs and internally generated entity IDs
+ dict *line_numbers*: map between line numbers and internally generated entity IDs
+ int *line_number*: number of line that is being parsed
+ str *line*: line that is being parsed
+ str *column_separator*: delimiter to split columns
+ str *column_enclosure*: any string that columns are supposed to be trimmed of
+ """
columns = [x.strip(column_enclosure) for x in line.strip('\n').split(column_separator)]
if line_number in line_numbers:
internal_id = line_numbers[line_number]
@@ -206,6 +283,15 @@ def get_dictionary_line(self, specs, entity_ids, line_numbers, line_number, line
return columns, internal_id
def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_option=0):
+ """Extracts normalized synonym from list of column values.
+ Returns tuple (str *normalized_synonym*, str *normalization_unit_name*).
+
+ Args:
+ list *columns*: list of column values from the file being read
+ dict *specs*: specs for columns in the file being read
+ str *word_separator*: word separator to use for tokenization
+ int *tokenizer_option*: tokenizer mode (see documentation for normalization for details)
+ """
synonym, normalizer_name = columns[specs['value'][0]], None
if self[self.NORMALIZER_KEY]:
if specs['tokenizer']:
@@ -220,6 +306,14 @@ def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_optio
return synonym, normalizer_name
def next_trie(self, specs, compressed, tokenizer_option, word_separator):
+ """Creates and returns dict that contains empty trie and metadata.
+
+ Args:
+ dict *specs*: specs for columns in the file being read for trie construction
+ bool *compressed*: whether constructed trie(s) must be compressed
+ int *tokenizer_option*: tokenizer mode (see documentation for normalization for details)
+ str *word_separator*: word separator to use for tokenization
+ """
if len(self[self.NORMALIZER_KEY]) == 0:
self.add_normalizer('bypass', '%s/normalizer.bypass.xml' % (os.path.abspath(os.path.dirname(__file__))))
new_trie = {
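The save() and load() docstrings above fix the on-disk layout. A round-trip sketch, assuming a file-backed model (save() asserts the temporary database exists on disk, so a ':memory:' model cannot be saved); the 'my_model' prefix is hypothetical:

import pilsner

m = pilsner.Model()
# ... compile tries and keywords into m here ...
m.save('my_model')   # writes my_model.normalizers, my_model.*.dictionary,
                     # my_model.keywords, and my_model.attributes
m.destroy()

m2 = pilsner.Model(filename='my_model')  # equivalent to Model() followed by load('my_model')
# ... use m2 ...
m2.destroy()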
From 6cfdf7a7dd44fec0cd7e4be6d824d8b91ea9b8cc Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 24 Sep 2020 00:50:42 -0400
Subject: [PATCH 075/116] docstring for utility (wip)
---
pilsner/model.py | 2 +-
pilsner/utility.py | 60 +++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 55 insertions(+), 7 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index 82e0e11..ca2d94d 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -66,7 +66,7 @@ def destroy(self):
os.remove(self.DEFAULT_DATASOURCE)
def __del__(self):
- """Desctructor"""
+ """Desctructor."""
try:
self.destroy()
except:
diff --git a/pilsner/utility.py b/pilsner/utility.py
index c3af781..e03611a 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -2,8 +2,17 @@
import os
class Recognizer():
+ """This class is the utility for named entity recognition."""
def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None):
+ """Creates Recognizer instance.
+
+ Args:
+ bool *debug_mode*: toggle logger level to INFO
+ bool *verbose_mode*: toggle logger level to DEBUG
+ function *callback_status*: callback function that message about status can be passed to
+ function *callback_progress*: callback function that message about progress can be passed to
+ """
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s')
self.debug = debug_mode
self.verbose = verbose_mode
@@ -18,19 +27,37 @@ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, c
logging.debug('Recognizer class has been initialized')
def __del__(self):
- # remove all temporary resources
+ """Destructor."""
pass
def push_message(self, message, callback_function):
+ """Passes message to callback_function.
+
+ Args:
+ str *message*: message to pass
+ function *callback_function*: function to take *message* as an argument
+ """
if callback_function is not None:
callback_function(message)
def compile_dict_specs(self, fields):
+ """Reshapes list of fields' specifications into dict used by other members of Recognizer class.
+ Returns new dict with specifications.
+
+ Args:
+ list *fields*: list of fields (columns)
+
+ Each member of *fields* list must be a dict as follows: {
+ 'name': 'str name of attribute',
+ 'include': bool True for including this column else False,
+ 'delimiter': 'str delimiter in case column stores concatenated lists',
+ 'id_flag': bool True if column stores entity ID else False,
+ 'normalizer_flag': bool True if column stores string normalizer tag else False,
+ 'value_flag': bool True if column stores string label to recognize else False
+ }
+ """
logging.debug('Compiling specs')
specs = {'fields': {}, 'id': None, 'tokenizer': None, 'value': None}
- # {'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
- # specs = {'DType': (0, None, False, True, False), 'MSID': (1, None, True, False, False), 'value': (2, None, False, False, True)}
- # specs = {'attr_name': (column_index, delimiter, normalizer_flag, value_flag)}
for i in range(0, len(fields)):
field = fields[i]
if not field['include']:
@@ -46,7 +73,19 @@ def compile_dict_specs(self, fields):
return specs
def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model):
- # NB: only works with uncompressed trie
+ """Inserts string into trie structure represented by dict object.
+
+ Args:
+ str *label*: string to insert
+ int *label_id*: ID of the label
+ int *entity_id*: ID of the entity given label belongs to
+ dict *subtrie*: object representing the trie
+ dict *specs*: dictionary specifications
+ list *columns*: list of values associated with the entity
+ *model*: instance of Model class handling the trie and metadata
+
+ NB: only works with uncompressed trie!
+ """
for character in label:
if character not in subtrie:
subtrie[character] = {}
@@ -54,7 +93,16 @@ def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model
model.store_attributes(label_id, entity_id, subtrie, specs, columns)
def remove_node(self, model, label, subtrie, prev_length=0):
- # NB: only works with uncompressed trie
+ """Removes string from trie structure represented by dict object.
+
+ Args:
+ *model*: instance of Model class handling the trie and metadata
+ str *label*: string to remove
+ dict *subtrie*: object representing the trie
+ int *prev_length*: length of substring found in the trie (used with recursion)
+
+ NB: only works with uncompressed trie!
+ """
if label:
head, tail = label[0], label[1:]
current_length = int(len(subtrie))
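The compile_dict_specs() and insert_node()/remove_node() docstrings above are easiest to read next to the unit tests. A sketch built from the test fixtures (column names are illustrative; insert_node() and remove_node() only work on uncompressed tries):

import pilsner

r = pilsner.Recognizer()
m = pilsner.Model()
m.create_recognizer_schema(m.cursor)  # attrs table used by store_attributes()

fields = [  # one dict per column of the dictionary file
    {'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
    {'name': 'MSID', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
    {'name': 'value', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
    {'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
specs = r.compile_dict_specs(fields)

trie = {}
r.insert_node('the synonym', 1, 10, trie, specs, ['', '', '', ''], m)  # one character per node
r.remove_node(m, 'the synonym', trie)
m.destroy()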
From e52a36f767b30250209d21eedf5e3ac53201dc5a Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 25 Sep 2020 01:04:32 -0400
Subject: [PATCH 076/116] docstrings for utility
---
pilsner/utility.py | 78 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 76 insertions(+), 2 deletions(-)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index e03611a..a1d52c2 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -96,7 +96,7 @@ def remove_node(self, model, label, subtrie, prev_length=0):
"""Removes string from trie structure represented by dict object.
Args:
- *model*: instance of Model class handling the trie and metadata
+ Model *model*: instance of Model class handling the trie and metadata
str *label*: string to remove
dict *subtrie*: object representing the trie
int *pref_length*: length of substring found in the trie (used with recursion)
@@ -118,6 +118,20 @@ def remove_node(self, model, label, subtrie, prev_length=0):
return len(subtrie) + 1, True
def make_recognizer(self, model, filename, specs, word_separator, item_limit, compressed, column_separator, column_enclosure, tokenizer_option):
+ """Reads tab-delimited text file, populates dict objects representing tries, and fills database associated with a given Model instance according to provided specs.
+ Returns tuple(list *tries*, dict *line_numbers*) where *tries* are populated dicts representing tries, *line_numbers* is dict that maps line numbers from the text file to internally generated entity IDs.
+
+ Args:
+ Model *model*: Model instance to populate
+ str *filename*: path and name of tab-delimited text file with the content
+ dict *specs*: specifications for columns in the text file
+ str *word_separator*: string considered to be the word delimiter
+ int *item_limit*: maximum number of rows to stuff in a single trie of a model
+ bool *compressed*: whether given tries must be compressed
+ str *column_separator*: delimiter to split columns
+ str *column_enclosure*: any string that columns are supposed to be trimmed of
+ int *tokenizer_option*: tokenizer mode (see documentation for normalization for details)
+ """
# TODO: review for refactoring
self.logger('Making recognizer using %s' % (filename))
self.push_message('Making recognizer using %s' % (filename), self.callback_status)
@@ -165,6 +179,20 @@ def make_recognizer(self, model, filename, specs, word_separator, item_limit, co
return ret, line_numbers
def make_keywords(self, model, filename, specs, line_numbers, word_separator, disambiguate_all, column_separator, column_enclosure, tokenizer_option):
+ """Generates dictionary of keywords for a given model using tab-delimited text file that contains entity IDs and synonyms. Typically, for a given model it is the same file `make_recognizer()` function is processing.
+ Returns dict object can be plugged into model.
+
+ Args:
+ Model *model*: Model instance to use
+ str *filename*: path and name of tab-delimited text file with the content
+ dict *specs*: specifications for columns in the text file
+ dict *line_numbers*: dict that maps line numbers from the text file to internally generated entity IDs
+ str *word_separator*: string considered to be the word delimiter
+ bool *disambiguate_all*: whether to generate keywords for all entities or only for those having conflicting synonyms
+ str *column_separator*: delimiter to split columns
+ str *column_enclosure*: any string that columns are supposed to be trimmed of
+ int *tokenizer_option*: tokenizer mode (see documentation for normalization for details)
+ """
self.logger('Making keywords using %s... ' % (filename))
self.push_message('Making keywords from {0}'.format(filename), self.callback_status)
total_bytes = os.path.getsize(filename) + 1
@@ -212,6 +240,21 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
return keywords
def compile_model(self, model, filename, specs, word_separator, column_separator, column_enclosure, compressed=True, item_limit=0, tokenizer_option=0, include_keywords=False, disambiguate_all=False):
+ """Populates given Model instance with tries and keywords.
+
+ Args:
+ Model *model*: Model instance to populate
+ str *filename*: path and name of tab-delimited text file with the content
+ dict *specs*: specifications for columns in the text file
+ str *word_separator*: string considered to be the word delimiter
+ str *column_separator*: delimiter to split columns
+ str *column_enclosure*: any string that columns are supposed to be trimmed of
+ bool *compressed*: whether given tries must be compressed
+ int *item_limit*: maximum number of rows to stuff in a single trie of a model
+ int *tokenizer_option*: tokenizer mode (see documentation for normalization for details)
+ bool *include_keywords*: whether to generate keywords at all
+ bool *disambiguate_all*: whether to generate keywords for all entities or only for those having conflicting synonyms
+ """
tries, line_numbers = self.make_recognizer(model, filename, specs, word_separator, item_limit, compressed, column_separator, column_enclosure, tokenizer_option)
keywords = {model.CONTENT_KEY: {}, model.INTERNAL_ID_KEY: {}}
if include_keywords:
@@ -221,7 +264,13 @@ def compile_model(self, model, filename, specs, word_separator, column_separator
return True
def unpack_trie(self, model, packed_trie, compressed):
- """TODO: add docstring here
+ """Unpacks compressed trie.
+ Returns dict object representing unpacked trie.
+
+ Args:
+ Model *model*: Model instance to use
+ dict *packed_trie*: trie to process
+ bool *compressed*: whether given trie is already compressed
"""
if not compressed or len(packed_trie) != 1:
return packed_trie
@@ -240,6 +289,17 @@ def unpack_trie(self, model, packed_trie, compressed):
return unpacked_trie
def unpack_attributes(self, cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query):
+ """Loads attributes for internal IDs found in a leaf of a trie from a model's database using associated sqlite3.connect.cursor object.
+ Returns dict object that maps internal IDs with attributes.
+
+ Args:
+ sqlite3.connect.cursor *cur*: cursor to use for executing queries
+ list *leaf_ids*: internal IDs found in a trie leaf
+ str *include_query*: part of SQL query to filter something in
+ str *exclude_query*: part of SQL query to filter something out
+ bint *process_exclude*: whether to use *exclude_query* at all
+ str *attrs_out_query*: part of SQL query that specifies which attributes to eventually return
+ """
attributes = {}
include_attrs = set()
exclude_attrs = set()
@@ -265,12 +325,16 @@ def unpack_attributes(self, cur, leaf_ids, include_query, exclude_query, process
return attributes
def check_attrs(self, model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query):
+ """
+ """
trie_leaf[model.ATTRS_KEY] = self.unpack_attributes(cur, trie_leaf[model.ENTITY_KEY], include_query, exclude_query, process_exclude, attrs_out_query)
if int(len(trie_leaf[model.ATTRS_KEY])) == 0:
return {}
return trie_leaf
def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query='', progress_from=0, progress_to=100):
+ """
+ """
# TODO: review for refactoring
self.logger('Analyzing "%s"... ' % (source_string))
rets = []
@@ -366,6 +430,8 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
return rets
def disambiguate(self, model, recognized, srcs, word_separator):
+ """
+ """
_recognized = sorted(recognized, key=lambda x: x[2])
id_list = [[model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]] for rec in _recognized]
for k in range(len(id_list)):
@@ -407,6 +473,8 @@ def disambiguate(self, model, recognized, srcs, word_separator):
return _recognized
def flatten_layers(self, model, layers):
+ """
+ """
spans = {}
srcs = []
for i in range(0, len(layers)):
@@ -447,6 +515,8 @@ def flatten_layers(self, model, layers):
return ret
def flatten_spans(self, spans):
+ """
+ """
ret = {}
all_entries = []
for span in spans:
@@ -474,6 +544,8 @@ def flatten_spans(self, spans):
return ret
def reduce_spans(self, segments):
+ """
+ """
def intersects(segment1, segment2):
return segment2[0] >= segment1[0] and segment2[0] <= segment1[1]
def length(segment):
@@ -500,6 +572,8 @@ def length(segment):
return ret
def parse(self, model, source_string, attrs_where=None, attrs_out=None):
+ """
+ """
attributes = attrs_where
if attributes is None:
attributes = {}
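compile_model() as documented above is the single call that chains make_recognizer() and make_keywords(). A sketch of a full compile over a hypothetical tab-delimited dictionary file, reusing the field-spec shape from the earlier sketch:

import pilsner

r = pilsner.Recognizer()
m = pilsner.Model()
fields = [  # minimal two-column file: entity ID and synonym
    {'name': 'MSID', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
    {'name': 'value', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True}
]
specs = r.compile_dict_specs(fields)
# columns split on tab, values stripped of newline, compressed tries, keywords included
r.compile_model(m, 'dictionary.txt', specs, ' ', '\t', '\n', include_keywords=True)
m.save('my_model')
m.destroy()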
From d3a874bdea249cc9ffa5e0957c5c216a5dab77f1 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 26 Sep 2020 01:31:31 -0400
Subject: [PATCH 077/116] docstrings
---
pilsner/model.py | 6 +-
pilsner/utility.py | 135 +++++++++++++++++++++++++++++++++++++++++----
2 files changed, 127 insertions(+), 14 deletions(-)
diff --git a/pilsner/model.py b/pilsner/model.py
index ca2d94d..8f7ab1d 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -78,7 +78,7 @@ def save(self, filename):
Args:
str *filename*: path and filename prefix for names of files that will be written.
-
+
Example: model.save('filename') will write the following files:
filename.normalizers
filename.*.dictionary (can be multiple files, depends on model settings)
@@ -117,7 +117,7 @@ def load(self, filename):
Args:
str *filename*: path and filename prefix for names of files that represent the model on disk.
-
+
Example: model.load('filename') will attempt reading following files:
filename.normalizers
filename.*.dictionary
@@ -307,7 +307,7 @@ def get_dictionary_synonym(self, columns, specs, word_separator, tokenizer_optio
def next_trie(self, specs, compressed, tokenizer_option, word_separator):
"""Creates and returns dict that contains empty trie and metadata.
-
+
Args:
dict *specs*: specs for columns in a file that is supposed to be being read for trie construction
bool *compressed*: whether constructed trie(s) must be compressed
diff --git a/pilsner/utility.py b/pilsner/utility.py
index a1d52c2..401aa01 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -46,7 +46,7 @@ def compile_dict_specs(self, fields):
Args:
list *fields*: list of fields (columns)
-
+
Each member of *fields* list must be a dict as follows: {
'name': 'str name of attribute',
'include': bool True for including this column else False,
@@ -83,8 +83,8 @@ def insert_node(self, label, label_id, entity_id, subtrie, specs, columns, model
dict *specs*: dictionary specifications
list *columns*: list of values associated with the entity
*model*: instance of Model class handling the trie and metadata
-
- NB: only works with uncompressed trie!
+
+ NB: only works with uncompressed trie.
"""
for character in label:
if character not in subtrie:
@@ -101,7 +101,7 @@ def remove_node(self, model, label, subtrie, prev_length=0):
dict *subtrie*: object representing the trie
int *prev_length*: length of substring found in the trie (used with recursion)
- NB: only works with uncompressed trie!
+ NB: only works with uncompressed trie.
"""
if label:
head, tail = label[0], label[1:]
@@ -325,7 +325,16 @@ def unpack_attributes(self, cur, leaf_ids, include_query, exclude_query, process
return attributes
def check_attrs(self, model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query):
- """
+ """Attaches attributes to a given trie leaf and returns it.
+
+ Args:
+ Model *model*: Model instance to use
+ dict *trie_leaf*: terminal node of a trie to attach attributes to
+ sqlite3.connect.cursor *cur*: cursor to use for executing queries
+ str *include_query*: part of SQL query to filter something in
+ str *exclude_query*: part of SQL query to filter something out
+ bint *process_exclude*: whether to use *exclude_query* at all
+ str *attrs_out_query*: part of SQL query that specifies which attributes to eventually return
"""
trie_leaf[model.ATTRS_KEY] = self.unpack_attributes(cur, trie_leaf[model.ENTITY_KEY], include_query, exclude_query, process_exclude, attrs_out_query)
if int(len(trie_leaf[model.ATTRS_KEY])) == 0:
@@ -333,7 +342,32 @@ def check_attrs(self, model, trie_leaf, cur, include_query, exclude_query, proce
return trie_leaf
def spot_entities(self, model, source_string, normalizer_name, include_query='', exclude_query='', process_exclude=False, attrs_out_query='', progress_from=0, progress_to=100):
- """
+ """Zooms through a string, finds boundaries of synonyms stored in model's trie, and pulls associated attributes from the storage.
+ Returns list(list(tuple *datapoint*)) where datapoint is tuple(list *ids*, dict *attributes*, str *found_synonym*, int *begin*, int *end*) where *ids* are internal IDs of entities, *attributes* is dict {id_entity: {attribute: [value]}}, *found_synonym* is identified substring, *begin* and *end* are indexes of first and last character of recognized substring.
+
+ Args:
+ Model *model*: Model instance to use
+ str *source_string*: string to parse
+ str *normalizer_name*: name of normalization unit (used to pick the right trie from the model; supposed to match normalization unit applied to *source_string*)
+ str *include_query*: part of SQL query to filter something in
+ str *exclude_query*: part of SQL query to filter something out
+ bint *process_exclude*: whether to use *exclude_query* at all
+ str *attrs_out_query*: part of SQL query that specifies which attributes to eventually return
+ int *progress_from*: initial progress value to report
+ int *progress_to*: maximum progress value to report
+
+ Data structure for returned value:
+ [
+ (
+ [int internal_ids],
+ {
+ int internal_id: {str attribute_name: [str attribute_value]}
+ },
+ str identified_label,
+ int unmapped_begin,
+ int unmapped_end
+ )
+ ]
"""
# TODO: review for refactoring
self.logger('Analyzing "%s"... ' % (source_string))
@@ -430,7 +464,30 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
return rets
def disambiguate(self, model, recognized, srcs, word_separator):
- """
+ """For a list of identified datapoints, weighs context of identified labels that belong to more than 1 entity and keeps heaviest ones.
+ Returns filtered list of identified datapoints.
+
+ Args:
+ Model *model*: Model instance to use
+ list *recognized*: identified datapoints
+ list *srcs*: list of all normalized values of original string (using all normalization units applied)
+ str *word_separator*: string to be considered a word separator
+
+ Data structure for *recognized* (input) and for returned value:
+ [
+ (
+ [int internal_ids],
+ {
+ int internal_id: {str attribute_name: [str attribute_value]}
+ },
+ int mapped_begin,
+ int mapped_end,
+ [int indexes_in_srcs],
+ [
+ (int unmapped_begin, int unmapped_end)
+ ]
+ )
+ ]
"""
_recognized = sorted(recognized, key=lambda x: x[2])
id_list = [[model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY][x] for x in rec[0] if x in model[model.KEYWORDS_KEY][model.INTERNAL_ID_KEY]] for rec in _recognized]
@@ -473,7 +530,42 @@ def disambiguate(self, model, recognized, srcs, word_separator):
return _recognized
def flatten_layers(self, model, layers):
- """
+ """Flattens list of lists of identified datapoints, invokes disambiguation, remaps label locations to the original string, reshapes the output.
+ Returns list(tuple *datapoint*) where *datapoint* is tuple(list *ids*, dict *attributes*, int *begin*, int *end*).
+
+ Args:
+ Model *model*: Model instance to use
+ list *layers*: list of identified datapoints
+
+ Data structure for *layers* (input):
+ [
+ (
+ (
+ [int normalized_positions], # indexes are original positions
+ [[int min_original_position, int max_original_position]], # indexes are normalized positions
+ ),
+ [
+ (
+ [int internal_ids],
+ {int internal_id: {str attribute_name: [str attribute_value]}},
+ str identified_label,
+ int unmapped_begin,
+ int unmapped_end
+ )
+ ],
+ str parsed_normalized_string
+ )
+ ]
+
+ Returned data structure:
+ [
+ (
+ [int internal_ids],
+ {int internal_id: {str attribute_name: [str attribute_value]}},
+ int mapped_begin,
+ int mapped_end
+ )
+ ]
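+
+ Example of returned value (illustrative; values are made up): [([4], {4: {'entity_id': ['entity1']}}, 8, 34)]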
"""
spans = {}
srcs = []
@@ -515,7 +607,11 @@ def flatten_layers(self, model, layers):
return ret
def flatten_spans(self, spans):
- """
+ """Transforms list of normalized tuples into one dict.
+ Returns dict {(int *begin*, int *end*): {str *attribute_name*: {str *attribute_value*}}}.
+
+ Args:
+ list *spans*: list of identified datapoints, as returned by flatten_layers() function
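+
+ Example of returned value (illustrative; values are made up): {(8, 34): {'entity_id': {'entity1'}}}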
"""
ret = {}
all_entries = []
@@ -544,7 +640,11 @@ def flatten_spans(self, spans):
return ret
def reduce_spans(self, segments):
- """
+ """Reduces overlapping segments by keeping longer ones or leftmost ones in case of equal length.
+ Returnes reduced list of tuples [(int *begin*, int *end*)].
+
+ Args:
+ set *segments*: set of tuples(int *begin*, int *end*)
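+
+ Example (mirrors the unit test): reduce_spans({(1, 2), (2, 3), (1, 6), (3, 8)}) returns [(1, 6)], since (1, 6) and (3, 8) have equal length and (1, 6) is the leftmost.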
"""
def intersects(segment1, segment2):
return segment2[0] >= segment1[0] and segment2[0] <= segment1[1]
@@ -572,7 +672,20 @@ def length(segment):
return ret
def parse(self, model, source_string, attrs_where=None, attrs_out=None):
- """
+ """Wraps around all functions that normalize string, spot entities, disambiguate, and post-process the output.
+ Returns dict {(int *begin*, int *end*): {str *attribute_name*: {str *attribute_value*}}}.
+
+ Args:
+ Model *model*: Model instance to use
+ str *source_string*: source string to parse
+ dict *attrs_where*: specifications for filtering model's data used for recognition
+ list *attrs_out*: list of attribute names to output
+
+ Data structure for *attrs_where*:
+ {
+ '+': {str attribute_name: {str attribute_value}}, # if indicated, only entities that have these attributes will be considered
+ '-': {str attribute_name: {str attribute_value}} # if indicated, entities that have these attributes will not be considered
+ }
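+
+ Example for *attrs_where* (illustrative; attribute names and values are taken from the bundled example dictionary): {'+': {'habitat': {'air', 'ocean'}}}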
"""
attributes = attrs_where
if attributes is None:
From df0967529342e52bc4e28e9c1f0e75490d5ced23 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Mon, 28 Sep 2020 11:12:48 -0400
Subject: [PATCH 078/116] plan for readme
---
README.md | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/README.md b/README.md
index cc5e429..b59b0c4 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,20 @@
# pilsner
-Utility for dictionary-based named entity recognition
+
+Python-implemented library for named entity recognition
+
+[![pypi][pypi-img]][pypi-url]
+
+[pypi-img]: https://img.shields.io/pypi/v/pilsner?style=plastic
+[pypi-url]: https://pypi.org/project/pilsner/
+
+## Purpose
+
+Blah
+
+## Installation and dependencies
+
+Blah
+
+## Usage
+
+Blah
From 6f7d80d6e4d00025fa2eb738ab31e55439573cb1 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Mon, 28 Sep 2020 23:38:49 -0400
Subject: [PATCH 079/116] readme
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index b59b0c4..707406d 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ Python-implemented library for named entity recognition
## Purpose
-Blah
+This library is intended to store any thesaurus in a trie-like structure and identify any of the stored synonyms in a string.
## Installation and dependencies
From 58368ad28de5707bdfd3be6aee577d598d5002cb Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 29 Sep 2020 00:07:37 -0400
Subject: [PATCH 080/116] readme - installation and dependencies
---
README.md | 24 ++++++++++++++++++++++--
1 file changed, 22 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 707406d..c41a5ab 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,32 @@ Python-implemented library for named entity recognition
## Purpose
-This library is intended to store any thesaurus in a trie-like structure and identify any of the stored synonyms in a string.
+This library is a Python implementation of a toolkit for dictionary-based named entity recognition. It is intended to store any thesaurus in a trie-like structure and identify any of the stored synonyms in a string.
## Installation and dependencies
-Blah
+```bash
+pip install pilsner
+```
+
+`pilsner` is tested in Python 3.6, 3.7, and 3.8.
+
+The only dependency is `sic` package. While it can be automatically installed at the time of `pilsner` installation, manual installation of `sic` beforehand might also be considered (see benchmark of cythonized vs pure Python implementation in `sic` documentation, [https://pypi.org/project/sic/](https://pypi.org/project/sic/)).
## Usage
+### Compile model
+
+Blah
+
+### Save model
+
+Blah
+
+### Load model
+
+Blah
+
+### Parse string
+
Blah
From bd9b78d11066ee5c001ac1599eb32893ac5c7cec Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 29 Sep 2020 23:34:26 -0400
Subject: [PATCH 081/116] readme
---
README.md | 46 +++++++++++++++++++++++++++++++++++++++-------
1 file changed, 39 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index c41a5ab..2ac8382 100644
--- a/README.md
+++ b/README.md
@@ -7,11 +7,11 @@ Python-implemented library for named entity recognition
[pypi-img]: https://img.shields.io/pypi/v/pilsner?style=plastic
[pypi-url]: https://pypi.org/project/pilsner/
-## Purpose
+## 1. Purpose
This library is a Python implementation of a toolkit for dictionary-based named entity recognition. It is intended to store any thesaurus in a trie-like structure and identify any of the stored synonyms in a string.
-## Installation and dependencies
+## 2. Installation and dependencies
```bash
pip install pilsner
@@ -21,20 +21,52 @@ pip install pilsner
The only dependency is `sic` package. While it can be automatically installed at the time of `pilsner` installation, manual installation of `sic` beforehand might also be considered (see benchmark of cythonized vs pure Python implementation in `sic` documentation, [https://pypi.org/project/sic/](https://pypi.org/project/sic/)).
-## Usage
+## 3. Diagram
-### Compile model
+Image
+
+## 4. Usage
+
+```python
+import pilsner
+```
+
+### 4.1. Initialize model
+
+```python
+m = pilsner.Model()
+```
+
+### 4.2. Add string normalization units
+
+String normalization is done by the `sic` component.
+
+### 4.3. Add dictionary
+
+Blah
+
+### 4.4. Initialize utility
+
+```python
+r = pilsner.Recognizer()
+```
+
+### 4.5. Compile model
+
+Blah
+
+### 4.6. Save model
Blah
-### Save model
+### 4.7. Load model
Blah
-### Load model
+### 4.8. Parse string
Blah
-### Parse string
+## 5. Example
Blah
From 8a85827a45226503ebcb301e88ea7ed3ca720e6d Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 30 Sep 2020 00:17:04 -0400
Subject: [PATCH 082/116] readme (sample code)
---
README.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 64 insertions(+)
diff --git a/README.md b/README.md
index 2ac8382..43e32db 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,70 @@ Image
```python
import pilsner
+
+m = pilsner.Model()
+m.add_normalizer('default', 'default_normalizer.xml')
+m.add_normalizer('custom', 'custom_normalizer.xml')
+m.normalizer_map = {
+ 'animal': 'default',
+ 'plant': 'custom'
+}
+r = pilsner.Recognizer()
+ fields = [
+ {
+ 'name': 'type',
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': False,
+ 'normalizer_flag': True,
+ 'value_flag': False
+ },
+ {
+ 'name': 'id',
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': True,
+ 'normalizer_flag': False,
+ 'value_flag': False
+ },
+ {
+ 'name': 'label',
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': False,
+ 'normalizer_flag': False,
+ 'value_flag': True
+ },
+ {
+ 'name': 'habitat',
+ 'include': True,
+ 'delimiter': ',',
+ 'id_flag': False,
+ 'normalizer_flag': False,
+ 'value_flag': False
+ }
+ ]
+ specs = r.compile_dict_specs(fields)
+ r.compile_model(
+ model=m,
+ filename='living_things.txt',
+ specs=specs,
+ word_separator=' ',
+ column_separator='\t',
+ column_enclosure='\n',
+ include_keywords=True
+ )
+ m.save('living_things')
+ m = pilsner.Model('living_things')
+ text_to_parse = 'sample text here'
+ parsed = r.parse(
+ model=m,
+ source_string=text_to_parse,
+ attrs_where={
+ '+': {'habitat': {'air', 'ocean'}}
+ },
+ attrs_out=['id', 'type', 'habitat']
+ )
```
### 4.1. Initialize model
From 941d087932e9c70e37b91e026bd44c9de4e2b516 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 30 Sep 2020 22:47:59 -0400
Subject: [PATCH 083/116] don't try spotting entities with empty model
---
pilsner/utility.py | 2 ++
1 file changed, 2 insertions(+)
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 401aa01..e4c5ed1 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -375,6 +375,8 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
this_progress_position = 0
last_progress_position = 0
total_tries = len(model[model.DICTIONARY_KEY])
+ if total_tries == 0:
+ return rets
progress_share = progress_to - progress_from
trie_increment = int(progress_share / total_tries)
current_trie_index = 0
From 1668a03ed6bdcd221a9f6c0d1f1377251382068a Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 30 Sep 2020 22:48:21 -0400
Subject: [PATCH 084/116] don't save anything if model is empty
---
pilsner/model.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/pilsner/model.py b/pilsner/model.py
index 8f7ab1d..222972a 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -87,6 +87,9 @@ def save(self, filename):
"""
assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
logging.debug('Saving model "%s"' % (filename))
+ if len(self[self.DICTIONARY_KEY]) == 0:
+ logging.warning('Model is empty, nothing saved')
+ return False
self.cursor.close()
self.connection.close()
normalizers = {
From 0b554046cb1ed0e8ea47f4d453a0f0197e1af776 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 30 Sep 2020 22:48:44 -0400
Subject: [PATCH 085/116] example (wip)
---
example/example.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 89 insertions(+)
create mode 100644 example/example.py
diff --git a/example/example.py b/example/example.py
new file mode 100644
index 0000000..831d964
--- /dev/null
+++ b/example/example.py
@@ -0,0 +1,89 @@
+import sys; sys.path.insert(0, '')
+
+# Import pilsner
+import pilsner
+
+# Initialize Model class
+m = pilsner.Model()
+
+# Add normalization units
+#m.add_normalizer('default', 'example/default_normalizer.xml')
+#m.add_normalizer('custom', 'example/custom_normalizer.xml')
+
+# Map names of normalization units to some string values
+m.normalizer_map = {
+ 'animal': 'default',
+ 'plant': 'custom'
+}
+
+# Initialize Recognizer class
+r = pilsner.Recognizer()
+
+# Provide table definition for example/living_things.txt file
+fields = [
+ {
+ 'name': 'type', # attribute name is 'type'
+ 'include': True, # include this column
+ 'delimiter': None, # no delimiter (single value per row)
+ 'id_flag': False, # entity IDs are not in this column
+ 'normalizer_flag': True, # tags for normalization units are in this column
+ 'value_flag': False # string labels (synonyms) are not in this column
+ },
+ {
+ 'name': 'id', # attribute name is 'id'
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': True, # entity IDs are in this column
+ 'normalizer_flag': False,
+ 'value_flag': False
+ },
+ {
+ 'name': 'label', # attribute name is 'label'
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': False,
+ 'normalizer_flag': False,
+ 'value_flag': True # string labels (synonyms) are in this column
+ },
+ {
+ 'name': 'habitat', # attribute name is 'habitat'
+ 'include': True,
+ 'delimiter': ',', # multiple values delimited with ',' can be stored in a single row
+ 'id_flag': False,
+ 'normalizer_flag': False,
+ 'value_flag': False
+ }
+]
+
+# Convert table definition into internally used `specs` structure
+specs = r.compile_dict_specs(fields)
+
+# Populate Model instance with data from example/living_things.txt file
+r.compile_model(
+ model=m,
+ filename='example/living_things.txt',
+ specs=specs,
+ word_separator=' ',
+ column_separator='\t',
+ column_enclosure='\n',
+ include_keywords=True
+)
+
+# Save Model instance to disk
+#m.save('living_things')
+
+# Load Model instance from disk
+#m = pilsner.Model('living_things')
+
+# Parse string
+text_to_parse = 'sample text here a b c d c b a'
+parsed = r.parse(
+ model=m,
+ source_string=text_to_parse,
+ attrs_where={
+ '+': {'habitat': {'air', 'ocean'}}
+ },
+ attrs_out=['id', 'type', 'habitat']
+)
+
+print(parsed)
From b21d81faa3906b99d913ee945969ad692a0bbbd3 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 30 Sep 2020 22:49:19 -0400
Subject: [PATCH 086/116] example (wip)
---
example/custom_normalizer.xml | 0
example/default_normalizer.xml | 0
example/living_things.txt | 1 +
3 files changed, 1 insertion(+)
create mode 100644 example/custom_normalizer.xml
create mode 100644 example/default_normalizer.xml
create mode 100644 example/living_things.txt
diff --git a/example/custom_normalizer.xml b/example/custom_normalizer.xml
new file mode 100644
index 0000000..e69de29
diff --git a/example/default_normalizer.xml b/example/default_normalizer.xml
new file mode 100644
index 0000000..e69de29
diff --git a/example/living_things.txt b/example/living_things.txt
new file mode 100644
index 0000000..593d58d
--- /dev/null
+++ b/example/living_things.txt
@@ -0,0 +1 @@
+a b c air
From 750883500ea74cfb9de337fd3e658dc4aa69b083 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 1 Oct 2020 00:15:18 -0400
Subject: [PATCH 087/116] readme (wip)
---
README.md | 126 ++++++++++++++++++++++++++++++------------------------
1 file changed, 71 insertions(+), 55 deletions(-)
diff --git a/README.md b/README.md
index 43e32db..c3635ee 100644
--- a/README.md
+++ b/README.md
@@ -38,69 +38,85 @@ m.normalizer_map = {
'plant': 'custom'
}
r = pilsner.Recognizer()
- fields = [
- {
- 'name': 'type',
- 'include': True,
- 'delimiter': None,
- 'id_flag': False,
- 'normalizer_flag': True,
- 'value_flag': False
- },
- {
- 'name': 'id',
- 'include': True,
- 'delimiter': None,
- 'id_flag': True,
- 'normalizer_flag': False,
- 'value_flag': False
- },
- {
- 'name': 'label',
- 'include': True,
- 'delimiter': None,
- 'id_flag': False,
- 'normalizer_flag': False,
- 'value_flag': True
- },
- {
- 'name': 'habitat',
- 'include': True,
- 'delimiter': ',',
- 'id_flag': False,
- 'normalizer_flag': False,
- 'value_flag': False
- }
- ]
- specs = r.compile_dict_specs(fields)
- r.compile_model(
- model=m,
- filename='living_things.txt',
- specs=specs,
- word_separator=' ',
- column_separator='\t',
- column_enclosure='\n',
- include_keywords=True
- )
- m.save('living_things')
- m = pilsner.Model('living_things')
- text_to_parse = 'sample text here'
- parsed = r.parse(
- model=m,
- source_string=text_to_parse,
- attrs_where={
- '+': {'habitat': {'air', 'ocean'}}
- },
- attrs_out=['id', 'type', 'habitat']
- )
+fields = [
+ {
+ 'name': 'type',
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': False,
+ 'normalizer_flag': True,
+ 'value_flag': False
+ },
+ {
+ 'name': 'id',
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': True,
+ 'normalizer_flag': False,
+ 'value_flag': False
+ },
+ {
+ 'name': 'label',
+ 'include': True,
+ 'delimiter': None,
+ 'id_flag': False,
+ 'normalizer_flag': False,
+ 'value_flag': True
+ },
+ {
+ 'name': 'habitat',
+ 'include': True,
+ 'delimiter': ',',
+ 'id_flag': False,
+ 'normalizer_flag': False,
+ 'value_flag': False
+ }
+]
+specs = r.compile_dict_specs(fields)
+r.compile_model(
+ model=m,
+ filename='living_things.txt',
+ specs=specs,
+ word_separator=' ',
+ column_separator='\t',
+ column_enclosure='\n',
+ include_keywords=True
+)
+m.save('living_things')
+m = pilsner.Model('living_things')
+text_to_parse = 'sample text here'
+parsed = r.parse(
+ model=m,
+ source_string=text_to_parse,
+ attrs_where={
+ '+': {'habitat': {'air', 'ocean'}}
+ },
+ attrs_out=['id', 'type', 'habitat']
+)
```
+`pilsner` consists of two major components: `Model` and `Recognizer`. The `Model` class provides storage for the dictionary and string normalization rules, as well as methods for populating this storage. The `Recognizer` class provides methods for accessing the `Model`.
+
### 4.1. Initialize model
+To initialize an empty model:
+
```python
m = pilsner.Model()
```
+To specify the path to the temporary database for an empty model:
+
+```python
+m = pilsner.Model(storage_location='path/to/database.file')
+```
+
+To load a model from disk:
+
+```python
+m = pilsner.Model(filename='path/to/model')
+```
+
### 4.2. Add string normalization units
String normalization is done by the `sic` component.
From 41e16ecdc589c96acec1d0272270fb5b9eed9da3 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 1 Oct 2020 01:26:59 -0400
Subject: [PATCH 088/116] exceptions
---
example/example.py | 4 ++--
pilsner/model.py | 10 ++++++----
pilsner/utility.py | 5 +++++
3 files changed, 13 insertions(+), 6 deletions(-)
diff --git a/example/example.py b/example/example.py
index 831d964..c89d208 100644
--- a/example/example.py
+++ b/example/example.py
@@ -70,10 +70,10 @@
)
# Save Model instance to disk
-#m.save('living_things')
+m.save('living_things')
# Load Model instance from disk
-#m = pilsner.Model('living_things')
+m = pilsner.Model('living_things')
# Parse string
text_to_parse = 'sample text here a b c d c b a'
diff --git a/pilsner/model.py b/pilsner/model.py
index 222972a..06037bb 100644
--- a/pilsner/model.py
+++ b/pilsner/model.py
@@ -85,11 +85,13 @@ def save(self, filename):
filename.keywords
filename.attributes
"""
- assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
+ try:
+ assert os.path.exists(self[self.DATASOURCE_KEY]), 'Cannot find temporary database on disk'
+ assert len(self[self.DICTIONARY_KEY]) > 0, 'Model is empty, nothing to save'
+ except Exception as e:
+ self.destroy()
+ raise e
logging.debug('Saving model "%s"' % (filename))
- if len(self[self.DICTIONARY_KEY]) == 0:
- logging.warning('Model is empty, nothing saved')
- return False
self.cursor.close()
self.connection.close()
normalizers = {
diff --git a/pilsner/utility.py b/pilsner/utility.py
index e4c5ed1..19a4519 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -716,6 +716,11 @@ def parse(self, model, source_string, attrs_where=None, attrs_out=None):
self.push_message('Parsing text', self.callback_status)
rets = []
total_normalizers = int(len(model[model.NORMALIZER_KEY]))
+ try:
+ assert total_normalizers > 0, 'Model does not have normalization units'
+ except Exception as e:
+ model.destroy()
+ raise e
spot_progress_share = int(100 / total_normalizers)
current_normalizer_index = 0
for normalizer_name in model[model.NORMALIZER_KEY]:
From 871a91f08d84945a06c9063cb0d8aaebbeee3d64 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 2 Oct 2020 00:48:35 -0400
Subject: [PATCH 089/116] moved example to misc
---
{example => misc/example}/custom_normalizer.xml | 0
{example => misc/example}/default_normalizer.xml | 0
{example => misc/example}/example.py | 12 ++++++++----
{example => misc/example}/living_things.txt | 0
4 files changed, 8 insertions(+), 4 deletions(-)
rename {example => misc/example}/custom_normalizer.xml (100%)
rename {example => misc/example}/default_normalizer.xml (100%)
rename {example => misc/example}/example.py (87%)
rename {example => misc/example}/living_things.txt (100%)
diff --git a/example/custom_normalizer.xml b/misc/example/custom_normalizer.xml
similarity index 100%
rename from example/custom_normalizer.xml
rename to misc/example/custom_normalizer.xml
diff --git a/example/default_normalizer.xml b/misc/example/default_normalizer.xml
similarity index 100%
rename from example/default_normalizer.xml
rename to misc/example/default_normalizer.xml
diff --git a/example/example.py b/misc/example/example.py
similarity index 87%
rename from example/example.py
rename to misc/example/example.py
index c89d208..fe7942e 100644
--- a/example/example.py
+++ b/misc/example/example.py
@@ -1,3 +1,6 @@
+# Either install the pilsner package into the environment first,
+# or run this from the project's root
+
import sys; sys.path.insert(0, '')
# Import pilsner
@@ -58,10 +61,10 @@
# Convert table definition into internally used `specs` structure
specs = r.compile_dict_specs(fields)
-# Populate Model instance with data from example/living_things.txt file
+# Populate Model instance with data from misc/example/living_things.txt file
r.compile_model(
model=m,
- filename='example/living_things.txt',
+ filename='misc/example/living_things.txt',
specs=specs,
word_separator=' ',
column_separator='\t',
@@ -70,10 +73,10 @@
)
# Save Model instance to disk
-m.save('living_things')
+m.save('misc/example/living_things')
# Load Model instance from disk
-m = pilsner.Model('living_things')
+m = pilsner.Model('misc/example/living_things')
# Parse string
text_to_parse = 'sample text here a b c d c b a'
@@ -86,4 +89,5 @@
attrs_out=['id', 'type', 'habitat']
)
+# Print out the result
print(parsed)
diff --git a/example/living_things.txt b/misc/example/living_things.txt
similarity index 100%
rename from example/living_things.txt
rename to misc/example/living_things.txt
From 95678e8f27af2ff05d76e8cb3531f5e0ed71758b Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 2 Oct 2020 00:48:49 -0400
Subject: [PATCH 090/116] diagram in readme
---
README.md | 2 +-
misc/pilsner-diagram.drawio | 1 +
misc/pilsner-diagram.svg | 3 +++
3 files changed, 5 insertions(+), 1 deletion(-)
create mode 100644 misc/pilsner-diagram.drawio
create mode 100644 misc/pilsner-diagram.svg
diff --git a/README.md b/README.md
index c3635ee..2e3e7b8 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ The only dependency is `sic` package. While it can be automatically installed at
## 3. Diagram
-Image
+![Diagram](misc/pilsner-diagram.svg)
## 4. Usage
diff --git a/misc/pilsner-diagram.drawio b/misc/pilsner-diagram.drawio
new file mode 100644
index 0000000..0bb305a
--- /dev/null
+++ b/misc/pilsner-diagram.drawio
@@ -0,0 +1 @@
+7Vzbdps4FP0aPzYLIS72YxK305lkmq4ma7V5xEbBajHyyCKx+/UjgTAgYYNjLm7TlxYOQkJH++xzkZwRvF5u/qLeavEv8VE4Mg1/M4LTkWlODJv/KwTbVOC4ZioIKPZTEcgF9/gnkkJDSmPso3WpISMkZHhVFs5JFKE5K8k8SslLudkTCcujrrwAaYL7uRfq0q/YZ4tUOraNXP4R4WCRjQwM+WTpZY2lYL3wfPJSEMH3I3hNCWHp1XJzjUKhu0wv6Xsf9jzdfRhFEWvyArt+GH98sJ1vX8ybT2vv8dP869M7ILt59sJYznhkOiHv8GpG+VUgruT3s22mFD6Vlbice5R3fvWywAzdr7y5kL1wGHDZgi1Dfgf45TpdVP55/DpTgrHr9xlRhjZ7JwV2quIQQ2SJGN3yJvIFSypXogtY4MKUiHspLJdstSislGNeyIaehEiw6z3XIr+QijxGqcYvrlSoKNXqV6k3aLq8WX6nd/cguPP+/ufDB5u82w9UHz+XNOn8FwubuuIKYO+8EAfRCF7yFt/jNcNP27xBthSStNLe+MclHdatUjxD9as0I3HkI/92thN48x8BFdK7mIU4QlLue/THHe8GM6F044LTZkloJlLRUk5oOueLiSgXhN4MhZ/JGjNMSg8EBjBnslulwRL7vpjOrsGl7JKRlYqmtRg+Cm7Rk1hhCHOEaXCqAN1ehAHHKUEsM+MCvCyjAl/ANTqCF9Tg9QXNSRBxM6O/AAgWhOKfJGJeNkZ7qJgRxsgyx8KDQEkCjmSMq91krklI+AjTiIgZyaeE+ogqTw4hDHSEMBP2CbFqt2BrQEI+DzbkLaFsQQISeeH7XHpFU/BIZeVtbkmyDGKpvyPGtjJy8mJGyuhDG8y+JcBxbXn7KHsT19NN8Wab3UR8wulbpp3dPxYf5u8ld9mLFa5IzPGwI+IqITGdowPmKVXHPBogdqBd6rp0VFAUegw/lz+k9RU2J2e0wqDpCrvdr/AJCyJf/Uww73Fn3qblls3bgGIepV5SUMkXi4Gx2pcNSn1BQ+koRZ3W0SWl3rbQbCUarA98szKOaVgK4tIec/zt9PV6v2Zrfu0ToUvuPziYVKzmSAQNAlAdBycRtmmUVxQ4+YrWRJ1WV0FBluUNY8+gSNe5bdfZc8mYL+zjzfk1/DxpyM/AqaSMU/nhWFMEZjk6mNgHGQK4JzUHwBh1b+jWH6y2jFXrDWJ13ANUJ5pPmuK5SEM8ydlVuZZP5vEyWeuj8622XZVSdKpIXseO7qXGneUVg0adZcuvMXx+8xlRzOctctFDZHCACoCdM4XaW1f5R1bSqyUNaJ0DR2RW3dhDldt3Y/YVZdF7Rnnuz2WRDEq9pBqRlPOfcLCXDFqpkZ7GAYoCLfdi+BKppSn4knENz2KG1pouywwwcLTvNFMe7CzSd4bk0Jw36xP3loKgLBCvJTS77SDoNIg7OodsIxJtl2cO8MnAAHcHBXhHQUJfGQM0GxqLdV7Wkn13wVoe0Ibtdass+a9vU1HCaXNS6UorImq7s80g3ZG2E4qMO1AfmCihiKPnI8DVtWfC07VXnY+8hmoG2Nk4quxdpsKu0oxs572eaAZJM6BtlqEGlSMnSns7i0ir23eTZlj6Vu7eECEzYiw2Rzn27hmhyYGg2hrDvi1cMfNk2zbJVLvIPEzV3u3KQnkm6oUwLZ0wb9D2hVD/99C5pebLZ6Bz1x0yonvtZmN5Oxn0E79ZjbePW9+eOG0/GQy7xKe71Vcs8CC7yeUTb9oOcOOtZKvs8Kq2pdvaTVaGyr65W996zHZyRvRrhgSTrwo5nRAVcrw65n/CG5SdDS5E051QvRpO77BQDKerDg7Brsp67vgP0Tcl+qZVrez4yJkk6pZe1uIL9SNe/SZGBc0Bjao6SYVDGtVR9bD2al/t5KiubmPVKjYHNSlXM6kpXnvLGQ5ij/0u3srad3ZtOMPS1H5gD2qgXPCkX0fA8aC54AFTU46vr+NQL/UetSly+IB324p1DNVJVBaCrQowd6VXvex79k7iuLJmZ+clrKZOoicfUTk4HPR851s6rw0dZZPnhPPa0KnOfNvOsNVx2j6vXQ3JPyWffgBp78lIjkajXf9LhLYAqQzVdsmnGpCD+sC3xJG7n6O1wJGW0Q9HquP0w5GD5u5viCP3lUKP5shJfxw5GYIjB/3txVviSBvALCdsgSVt4Cq9dcWT+kj9MOXAPzl+K0xpG5M6IDXHpNEA4a3BUh3sRL7kt/lftEmb538WCL7/Hw==
\ No newline at end of file
diff --git a/misc/pilsner-diagram.svg b/misc/pilsner-diagram.svg
new file mode 100644
index 0000000..aba93f4
--- /dev/null
+++ b/misc/pilsner-diagram.svg
@@ -0,0 +1,3 @@
+
+
+
\ No newline at end of file
From c097d6c382d7afcf282629a67b19587ef293b4e8 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 2 Oct 2020 01:19:06 -0400
Subject: [PATCH 091/116] path adjustments for the example
---
misc/example/example.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/misc/example/example.py b/misc/example/example.py
index fe7942e..3fc8189 100644
--- a/misc/example/example.py
+++ b/misc/example/example.py
@@ -10,8 +10,8 @@
m = pilsner.Model()
# Add normalization units
-#m.add_normalizer('default', 'example/default_normalizer.xml')
-#m.add_normalizer('custom', 'example/custom_normalizer.xml')
+#m.add_normalizer('default', 'misc/example/default_normalizer.xml')
+#m.add_normalizer('custom', 'misc/example/custom_normalizer.xml')
# Map names of normalization units to some string values
m.normalizer_map = {
@@ -22,7 +22,7 @@
# Initialize Recognizer class
r = pilsner.Recognizer()
-# Provide table definition for example/living_things.txt file
+# Provide table definition for misc/example/living_things.txt file
fields = [
{
'name': 'type', # attribute name is 'type'
From e1c812306c6b8fd7dac43faf436c898f9104e94b Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 2 Oct 2020 21:27:32 -0400
Subject: [PATCH 092/116] readme
---
README.md | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 2e3e7b8..1c72689 100644
--- a/README.md
+++ b/README.md
@@ -23,10 +23,16 @@ The only dependency is `sic` package. While it can be automatically installed at
## 3. Diagram
+`pilsner` consists of two major components: `Model` and `Recognizer`. The `Model` class provides storage for the dictionary and string normalization rules, as well as methods for populating this storage. The `Recognizer` class provides methods for accessing the `Model`.
+
![Diagram](misc/pilsner-diagram.svg)
## 4. Usage
+```python
+import pilsner
+```
+
```python
import pilsner
@@ -95,8 +101,6 @@ parsed = r.parse(
)
```
-`pilsner` consists of two major components: `Model` and `Recognizer`. The `Model` class provides storage for the dictionary and string normalization rules, as well as methods for populating this storage. The `Recognizer` class provides methods for accessing the `Model`.
-
### 4.1. Initialize model
To initialize empty model:
@@ -119,7 +123,7 @@ m = pilsner.Model(filename='path/to/model')
### 4.2. Add string normalization units
-String normalization is done by the `sic` component.
+String normalization is technically done by the `sic` component.
### 4.3. Add dictionary
From 0199e74b43fda91a6d74c20817be049f16d79836 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Fri, 2 Oct 2020 21:28:07 -0400
Subject: [PATCH 093/116] readme
---
README.md | 14 +++++++++++---
1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 2e3e7b8..2af88c0 100644
--- a/README.md
+++ b/README.md
@@ -99,24 +99,32 @@ parsed = r.parse(
### 4.1. Initialize model
-To initialize an empty model:
+- To initialize an empty model:
```python
m = pilsner.Model()
```
-To specify the path to the temporary database for an empty model:
+- To specify the path to the temporary database for an empty model:
```python
+# For temporary database to be created on disk:
m = pilsner.Model(storage_location='path/to/database.file')
+
+# For temporary database to be created in memory:
+m = pilsner.Model(storage_location=':memory:')
```
-To load a model from disk:
+> If the database is created in memory, the model cannot later be saved to disk (it can only be used within the current session).
+
+- To load a model from disk:
```python
m = pilsner.Model(filename='path/to/model')
```
+> For more on how the model is loaded from disk, see [4.7. Load model](#47-load-model).
+
### 4.2. Add string normalization units
String normalization is done by the `sic` component.
From 2c2b83c1f06ee1cac1eb72f0b0824dfb6c4a24eb Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 3 Oct 2020 00:56:46 -0400
Subject: [PATCH 094/116] readme
---
README.md | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 6ce6d53..5167877 100644
--- a/README.md
+++ b/README.md
@@ -127,11 +127,17 @@ m = pilsner.Model(storage_location=':memory:')
m = pilsner.Model(filename='path/to/model')
```
-> For more on how the model is loaded from disk, see [4.7. Load model](#47-load-model).
+> For more on how the model is saved to and loaded from disk, see [4.6. Save model](#46-save-model) and [4.7. Load model](#47-load-model).
### 4.2. Add string normalization units
-String normalization is technically done by the `sic` component.
+```python
+# Assuming m is pilsner.Model instance:
+m.add_normalizer(
+ normalizer_name='normalizer_tag',
+ filename='path/to/normalizer_config.xml'
+)
+```
### 4.3. Add dictionary
From 6418d90f51911fa494e00b14e92b33b96f0b3752 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sat, 3 Oct 2020 00:59:23 -0400
Subject: [PATCH 095/116] readme
---
README.md | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/README.md b/README.md
index 5167877..ca8a8cb 100644
--- a/README.md
+++ b/README.md
@@ -112,10 +112,12 @@ m = pilsner.Model()
- To specify path to temporary database for empty model:
```python
-# For temporary database to be created on disk:
m = pilsner.Model(storage_location='path/to/database.file')
+```
+
+- To create an empty model that uses a database created in memory rather than on disk:
-# For temporary database to be created in memory:
+```python
m = pilsner.Model(storage_location=':memory:')
```
From b487b13277629fc5eb4c276ec84f0ca68356a039 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 4 Oct 2020 00:12:33 -0400
Subject: [PATCH 096/116] readme (add normalizers)
---
README.md | 16 +++++++++++++---
1 file changed, 13 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index ca8a8cb..6c4e82f 100644
--- a/README.md
+++ b/README.md
@@ -133,6 +133,11 @@ m = pilsner.Model(filename='path/to/model')
### 4.2. Add string normalization units
+Depending on the dictionary and the nature of the text to be parsed, string normalization might not be required at all; in that case, nothing specific needs to be done.
+
+- Without string normalization, synonyms from the dictionary will be stored as they are and looked up by the recognizer case-sensitively.
+- To add a single normalization unit:
+
```python
# Assuming m is pilsner.Model instance:
m.add_normalizer(
@@ -141,16 +146,21 @@ m.add_normalizer(
)
```
-### 4.3. Add dictionary
+> String normalization is technically done by the `sic` component. See the `sic` documentation at [https://pypi.org/project/sic/](https://pypi.org/project/sic/) to learn how to design a normalizer config.
-Blah
+- A model can embed more than one normalization unit.
+- The default normalization unit for the model is the one added first, or the last one added with the parameter `default` set to `True`.
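+
+A minimal sketch of adding a second normalization unit and making it the default (assuming the `default` parameter behaves as described above; the tag and path are made up):
+
+```python
+# Assuming m is pilsner.Model instance:
+m.add_normalizer(
+    normalizer_name='another_normalizer_tag',
+    filename='path/to/another_normalizer_config.xml',
+    default=True
+)
+```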
-### 4.4. Initialize utility
+### 4.3. Initialize utility
```python
r = pilsner.Recognizer()
```
+### 4.4. Add dictionary
+
+Blah
+
### 4.5. Compile model
Blah
From 352119f36379223c42b84ecfc9eaf1d011045cce Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 4 Oct 2020 22:07:58 -0400
Subject: [PATCH 097/116] renamed Recognizer to Utility
---
misc/example/example.py | 4 +--
misc/pilsner-diagram.drawio | 2 +-
misc/pilsner-diagram.svg | 2 +-
pilsner/__init__.py | 2 +-
pilsner/utility.pxd | 2 +-
pilsner/utility.py | 8 ++---
test/performance.py | 16 +++++-----
test/sandbox.py | 8 ++---
test/ut_utility.py | 62 ++++++++++++++++++-------------------
9 files changed, 53 insertions(+), 53 deletions(-)
diff --git a/misc/example/example.py b/misc/example/example.py
index 3fc8189..e512954 100644
--- a/misc/example/example.py
+++ b/misc/example/example.py
@@ -19,8 +19,8 @@
'plant': 'custom'
}
-# Initialize Recognizer class
-r = pilsner.Recognizer()
+# Initialize Utility class
+r = pilsner.Utility()
# Provide table definition for misc/example/living_fileds.txt file
fields = [
diff --git a/misc/pilsner-diagram.drawio b/misc/pilsner-diagram.drawio
index 0bb305a..1a460c1 100644
--- a/misc/pilsner-diagram.drawio
+++ b/misc/pilsner-diagram.drawio
@@ -1 +1 @@
-7Vzbdps4FP0aPzYLIS72YxK305lkmq4ma7V5xEbBajHyyCKx+/UjgTAgYYNjLm7TlxYOQkJH++xzkZwRvF5u/qLeavEv8VE4Mg1/M4LTkWlODJv/KwTbVOC4ZioIKPZTEcgF9/gnkkJDSmPso3WpISMkZHhVFs5JFKE5K8k8SslLudkTCcujrrwAaYL7uRfq0q/YZ4tUOraNXP4R4WCRjQwM+WTpZY2lYL3wfPJSEMH3I3hNCWHp1XJzjUKhu0wv6Xsf9jzdfRhFEWvyArt+GH98sJ1vX8ybT2vv8dP869M7ILt59sJYznhkOiHv8GpG+VUgruT3s22mFD6Vlbice5R3fvWywAzdr7y5kL1wGHDZgi1Dfgf45TpdVP55/DpTgrHr9xlRhjZ7JwV2quIQQ2SJGN3yJvIFSypXogtY4MKUiHspLJdstSislGNeyIaehEiw6z3XIr+QijxGqcYvrlSoKNXqV6k3aLq8WX6nd/cguPP+/ufDB5u82w9UHz+XNOn8FwubuuIKYO+8EAfRCF7yFt/jNcNP27xBthSStNLe+MclHdatUjxD9as0I3HkI/92thN48x8BFdK7mIU4QlLue/THHe8GM6F044LTZkloJlLRUk5oOueLiSgXhN4MhZ/JGjNMSg8EBjBnslulwRL7vpjOrsGl7JKRlYqmtRg+Cm7Rk1hhCHOEaXCqAN1ehAHHKUEsM+MCvCyjAl/ANTqCF9Tg9QXNSRBxM6O/AAgWhOKfJGJeNkZ7qJgRxsgyx8KDQEkCjmSMq91krklI+AjTiIgZyaeE+ogqTw4hDHSEMBP2CbFqt2BrQEI+DzbkLaFsQQISeeH7XHpFU/BIZeVtbkmyDGKpvyPGtjJy8mJGyuhDG8y+JcBxbXn7KHsT19NN8Wab3UR8wulbpp3dPxYf5u8ld9mLFa5IzPGwI+IqITGdowPmKVXHPBogdqBd6rp0VFAUegw/lz+k9RU2J2e0wqDpCrvdr/AJCyJf/Uww73Fn3qblls3bgGIepV5SUMkXi4Gx2pcNSn1BQ+koRZ3W0SWl3rbQbCUarA98szKOaVgK4tIec/zt9PV6v2Zrfu0ToUvuPziYVKzmSAQNAlAdBycRtmmUVxQ4+YrWRJ1WV0FBluUNY8+gSNe5bdfZc8mYL+zjzfk1/DxpyM/AqaSMU/nhWFMEZjk6mNgHGQK4JzUHwBh1b+jWH6y2jFXrDWJ13ANUJ5pPmuK5SEM8ydlVuZZP5vEyWeuj8622XZVSdKpIXseO7qXGneUVg0adZcuvMXx+8xlRzOctctFDZHCACoCdM4XaW1f5R1bSqyUNaJ0DR2RW3dhDldt3Y/YVZdF7Rnnuz2WRDEq9pBqRlPOfcLCXDFqpkZ7GAYoCLfdi+BKppSn4knENz2KG1pouywwwcLTvNFMe7CzSd4bk0Jw36xP3loKgLBCvJTS77SDoNIg7OodsIxJtl2cO8MnAAHcHBXhHQUJfGQM0GxqLdV7Wkn13wVoe0Ibtdass+a9vU1HCaXNS6UorImq7s80g3ZG2E4qMO1AfmCihiKPnI8DVtWfC07VXnY+8hmoG2Nk4quxdpsKu0oxs572eaAZJM6BtlqEGlSMnSns7i0ir23eTZlj6Vu7eECEzYiw2Rzn27hmhyYGg2hrDvi1cMfNk2zbJVLvIPEzV3u3KQnkm6oUwLZ0wb9D2hVD/99C5pebLZ6Bz1x0yonvtZmN5Oxn0E79ZjbePW9+eOG0/GQy7xKe71Vcs8CC7yeUTb9oOcOOtZKvs8Kq2pdvaTVaGyr65W996zHZyRvRrhgSTrwo5nRAVcrw65n/CG5SdDS5E051QvRpO77BQDKerDg7Brsp67vgP0Tcl+qZVrez4yJkk6pZe1uIL9SNe/SZGBc0Bjao6SYVDGtVR9bD2al/t5KiubmPVKjYHNSlXM6kpXnvLGQ5ij/0u3srad3ZtOMPS1H5gD2qgXPCkX0fA8aC54AFTU46vr+NQL/UetSly+IB324p1DNVJVBaCrQowd6VXvex79k7iuLJmZ+clrKZOoicfUTk4HPR851s6rw0dZZPnhPPa0KnOfNvOsNVx2j6vXQ3JPyWffgBp78lIjkajXf9LhLYAqQzVdsmnGpCD+sC3xJG7n6O1wJGW0Q9HquP0w5GD5u5viCP3lUKP5shJfxw5GYIjB/3txVviSBvALCdsgSVt4Cq9dcWT+kj9MOXAPzl+K0xpG5M6IDXHpNEA4a3BUh3sRL7kt/lftEmb538WCL7/Hw==
\ No newline at end of file
+7Vxbd9soEP41fmyOELrYj0nctLvJNj1N9rR9xBaRaSXhRaix++sXZLCutuVYF7fpU6QRAjHzzcfMgDOC1+HqHUPLxT/Uw8HINLzVCE5HpgmMsSP+SMl6IxkbSuAz4qlGmeCB/MT6TSVNiIfjQkNOacDJsiic0yjCc16QIcboc7HZEw2Koy6RjyuChzkKqtLPxOMLNQvbyOTvMfEXemRgqCch0o2VIF4gjz7nRPDtCF4zSvnmKlxd40AqT+tl897NjqfbD2M44k1e4NeP4/ePtvPlk3n7IUZfP8w/P70BqpsfKEjUjEemE4gOr2ZMXPnySn0/X2uliKks5eUcMdH51fOCcPywRHMpexY4ELIFDwNxB8RlvDGq+DxxrZVgbPvNz0B/DWYcr3IiNaN3mIaYs7Voop5aSrkKXcACF6a9ET3nzKVaLXKWcswL1RApiPjb3jMtigulyGOUavziSoUlpVr9KvUWT8Pb8Bu7fwD+Pfrr75sbm77ZDVSP/Cho0vkvkT51JWbL36CA+NEIXooW35KYk6d11kCbQrHWpjfxcWmHh6yUzPBhK81oEnnYu5ttBWj+3WdSep/wgERYyT3Evt+LbgiXSjcuDLsoNFOpbKkmNJ0L+2ImBAGa4eAjjQkntPBAGpwIJrsrNQiJ58npbBtcqi45XZbRFMvhI/8OP0kLQ9gSwoDjFCCm3TgHL8uowRdwjY7gBSvw+peTQJrj/BGwoIz8pBFHeoz2IDGjnNMwA8KjhEiKjHSMq+1krmlAxQjTiMoZqaeUeZiVnuyDl9URvEzYJ77q1wS7AiTsiUhD3VLGF9SnEQreZtIrtgGPUlbW5o6mZpCm/oY5X6uwCSWcFtGHV4R/SYHj2ur2q+pNXk9X+Zu1vonEhDdvmba+/5p/mL2X3ukXm61DMU3YHO9xRaUpjpiP+Z52epmSetyLCoYDxMmPYkTXuoXNyRlZGDS1sNuBhdsziHr1IyViiK17m5ZbdG8DynkUetmgTL2Yj4rLfdmg0Bc0Sh1tYFjp6JIxtM41W8oG8Z5vLo1jGlYJcZseM/xt9fXyRc2uLGofKAvF+iHAVMZqhkTQIPpsOeI0jaJFgZNZ9EDIaXUVEegUbxh/Bnm6znz7kD8XnPnC7oewJw0JGzid8MOxrgjMYnQwsfcyBHBPag6AMere0a0/WG0Zq9YrxOq4B6hOKmvSlMxlGoLY7lzLo/MkTG19dL7VccWpJnPVJc/8KjXuLK8YNOosev4Bxxc3HzEjYt4yF91HBnuoANgZU5R7a4kidPnuIEfA8+QI7dWNV6hi+27cvqYm+sCZyP2FLFJBKUqrEWkt/4n4O8ngDAqkJQVa7sXw9VGrouBLLjQ8SziOK7osMsDA0b7TTHmws0jfGZJDM948nLi3xXBOQ4az2ya40yDuVDlkHdFoHZ45wCcDA9wdFOAdBQl9ZQzQbOgs1nl5i/7unLc84hXfuazy9E/frlIKp81J7VJaE1Hbne0EVRfSdkKRcQfqA5NSKOJU8xHgVrVnwtO1V5+PvIRqBtjZOKrsXaTClnhFb18c5pXzSDOgbRahBkvnTUrtbR2R1rfvJs2wqvu4O0ME7cREbo4K7D1wytLTQAdrDLu2cOXM023bNFPtIvMwy/5u1xbKtagXwrSqhHmL18+Ueb+Hzq1yvnwGOnfdISO6l242FreTQT/xm9V4P7n17YnT9pPBsCY+fVltx8Dd7yYXj7tVdoAbbyVbxQWvblu6rd3k0lD6m7tdW4/ZTtZEH3MsmXyZy+mkKJfjHWL+J7LC+mBwLpruhOrL4fQWC/lwuu7gEOyqrOeO/xB9U6JvWtXSx0fOJFG3qmUtYajvyfI3cSpoDuhU9UkqHNKpjqqHDVv70p6Sd6l6jZrn5VJuxaWmJEbhjPgJ4r/LamXtOrs2nGNV1L5nD+pXzAXheNBccK/v5bT+CcdJUC31HrUpsv+Ad9uKdYzyIlFbCLZqwNyVXqtl37NfJI4ra7a1m2g1XSSGWiNqvwYOer7zNZ3Xhk5pk+eE89rQqc98286wy+O0fV67HpJ/Sj79ANLekZEcjUb78C8R2gJkaai2Sz71gBx0DXxNHLn9OVoLHGkZ/XBkeZx+OHLQ3P0VceSuUujRHDnpjyMnQ3DkoL+9eE0caQOoc8IWWNIGbqm3rniyOlI/TDnwT45fC1PaxuQQkJpj0miA8NZgWR7sRL4Ut9m/s9k0z/4pEHz7Pw==
\ No newline at end of file
diff --git a/misc/pilsner-diagram.svg b/misc/pilsner-diagram.svg
index aba93f4..4876169 100644
--- a/misc/pilsner-diagram.svg
+++ b/misc/pilsner-diagram.svg
@@ -1,3 +1,3 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/pilsner/__init__.py b/pilsner/__init__.py
index ebb1491..9fa0d0b 100644
--- a/pilsner/__init__.py
+++ b/pilsner/__init__.py
@@ -1,2 +1,2 @@
from .model import Model
-from .utility import Recognizer
+from .utility import Utility
diff --git a/pilsner/utility.pxd b/pilsner/utility.pxd
index 320b3fc..e27f7fe 100644
--- a/pilsner/utility.pxd
+++ b/pilsner/utility.pxd
@@ -1,6 +1,6 @@
import cython
-cdef class Recognizer():
+cdef class Utility():
cdef public bint debug
cdef public bint verbose
diff --git a/pilsner/utility.py b/pilsner/utility.py
index 19a4519..dddd41f 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -1,11 +1,11 @@
import logging
import os
-class Recognizer():
+class Utility():
"""This class is the utility for named entity recognition."""
def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, callback_progress=None):
- """Creates Recognizer instance.
+ """Creates Utility instance.
Args:
bool *debug_mode*: toggle logger level to INFO
@@ -24,7 +24,7 @@ def __init__(self, debug_mode=False, verbose_mode=False, callback_status=None, c
self.logger('Debug mode is on')
self.callback_status = callback_status
self.callback_progress = callback_progress
- logging.debug('Recognizer class has been initialized')
+ logging.debug('Utility class has been initialized')
def __del__(self):
"""Destructor."""
@@ -41,7 +41,7 @@ def push_message(self, message, callback_function):
callback_function(message)
def compile_dict_specs(self, fields):
- """Reshapes list of fields' specifications into dict used by other members of Recognizer class.
+ """Reshapes list of fields' specifications into dict used by other members of Utility class.
Returns new dict with specifications.
Args:
diff --git a/test/performance.py b/test/performance.py
index 35640e0..bb3609f 100644
--- a/test/performance.py
+++ b/test/performance.py
@@ -140,17 +140,17 @@ def perf_compile_model_save_model(modules_to_test):
import %s as pilsner
model = pilsner.Model()
model.add_normalizer('standard', None)
-recognizer = pilsner.Recognizer()
+utility = pilsner.Utility()
fields = [
{'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'label_attr', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
{'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
-specs = recognizer.compile_dict_specs(fields)
+specs = utility.compile_dict_specs(fields)
""" % (x),
stmt="""
-recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+utility.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
model.destroy()
""",
number=n
@@ -166,15 +166,15 @@ def perf_compile_model_save_model(modules_to_test):
import %s as pilsner
model = pilsner.Model()
model.add_normalizer('standard', None)
-recognizer = pilsner.Recognizer()
+utility = pilsner.Utility()
fields = [
{'name': 'entity_id', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'label_attr', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
{'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
-specs = recognizer.compile_dict_specs(fields)
-recognizer.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+specs = utility.compile_dict_specs(fields)
+utility.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
""" % (x),
stmt="""
model.save('.test-model')
@@ -214,12 +214,12 @@ def perf_load_model_parse_test(modules_to_test):
import %s as pilsner
model = pilsner.Model()
model.load('.test-model')
-recognizer = pilsner.Recognizer()
+utility = pilsner.Utility()
with open('.test-text.txt', mode='r', encoding='utf8') as f:
test_text = f.read()
""" % (x),
stmt="""
-found = recognizer.parse(model, test_text)
+found = utility.parse(model, test_text)
model.destroy()
""",
number=n
diff --git a/test/sandbox.py b/test/sandbox.py
index 534d6f3..75a4a49 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -18,7 +18,7 @@ def save_it():
'tokenizer1': 'tokenizer1',
'tokenizer2': 'tokenizer2'
}
- r = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_mesage)
+ r = pilsner.Utility(callback_status=callback_update_status, callback_progress=callback_update_mesage)
specs = {'DType': (0, None, True, False), 'MSID': (1, None, False, False), 'value': (2, None, False, True)}
fields = [
{'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
@@ -38,7 +38,7 @@ def save_it():
m.save('.test_model')
def load_it():
- rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_mesage)
+ rrr = pilsner.Utility(callback_status=callback_update_status, callback_progress=callback_update_mesage)
m = pilsner.Model('.test_model')
s = 'this is awesome white refrigerators o refrigerator, is it not'
s *= 10
@@ -50,7 +50,7 @@ def load_it():
load_it()
#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
-#r = Recognizer()
+#r = Utility()
#red = r.reduce(segments)
#print(red)
@@ -60,7 +60,7 @@ def load_it():
#layers = [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([0], {0: {'MSID': ['entity2'], 'smth': ['C', 'D', 'E']}}, 'acinic cell carcino mas', 8, 31)]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([2], {2: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'acinic carcinomas', 8, 25), ([5], {5: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'it', 45, 46), ([6], {6: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'o', 26, 27)])]
#layers = [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([0], {0: {'MSID': ['entity2'], 'smth': ['C', 'D', 'E']}}, 'acinic cell carcino mas', 8, 31)]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([2], {2: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'acinic carcinomas', 8, 26), ([5], {5: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'it', 45, 46), ([6], {6: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'o', 26, 27)])]
-#rrr = pilsner.Recognizer(callback_status=callback_update_status, callback_progress=callback_update_mesage)
+#rrr = pilsner.Utility(callback_status=callback_update_status, callback_progress=callback_update_mesage)
#x = rrr.flatten(layers)
#print(x)
diff --git a/test/ut_utility.py b/test/ut_utility.py
index ff63988..afefafc 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -4,11 +4,11 @@
class TestUtility(unittest.TestCase):
def setUp(self):
- self.recognizer = pilsner.Recognizer()
+ self.utility = pilsner.Utility()
self.model = pilsner.Model()
def tearDown(self):
- del(self.recognizer)
+ del(self.utility)
self.model.destroy()
del(self.model)
@@ -19,7 +19,7 @@ def compile_test_model(self):
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
- specs = self.recognizer.compile_dict_specs(fields)
+ specs = self.utility.compile_dict_specs(fields)
model = self.model
model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
@@ -27,25 +27,25 @@ def compile_test_model(self):
'tokenizer1': 't1',
'tokenizer2': 't2'
}
- compiled = self.recognizer.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', column_enclosure='', include_keywords=True)
+ compiled = self.utility.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', column_enclosure='', include_keywords=True)
return compiled, model
def test_init(self):
- r = pilsner.Recognizer()
- assert 'r' in locals(), 'Instance of Recognizer class has not been created'
- assert type(r) == pilsner.Recognizer, 'Utility is supposed to have pilsner.Recognizer type, but has %s instead' % (str(type(r)))
+ r = pilsner.Utility()
+ assert 'r' in locals(), 'Instance of Utility class has not been created'
+ assert type(r) == pilsner.Utility, 'Utility is supposed to have pilsner.Utility type, but has %s instead' % (str(type(r)))
def test_del(self):
- r = pilsner.Recognizer()
+ r = pilsner.Utility()
del(r)
- assert 'r' not in locals(), 'Instance of Recognizer class has not been destroyed'
+ assert 'r' not in locals(), 'Instance of Utility class has not been destroyed'
def test_push_message(self):
messages = []
def callback_function(message):
messages.append(message)
- self.recognizer.push_message('message 1', callback_function)
- self.recognizer.push_message('message 2', callback_function)
+ self.utility.push_message('message 1', callback_function)
+ self.utility.push_message('message 2', callback_function)
expected = ['message 1', 'message 2']
assert messages == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(messages))
@@ -57,7 +57,7 @@ def test_compile_dict_specs(self):
{'name': 'column 4', 'include': False, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
{'name': 'column 5', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
- specs = self.recognizer.compile_dict_specs(fields)
+ specs = self.utility.compile_dict_specs(fields)
expected = {
'fields': {
'column 1': (0, None, True, False),
@@ -78,19 +78,19 @@ def test_insert_node(self):
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
- specs = self.recognizer.compile_dict_specs(fields)
+ specs = self.utility.compile_dict_specs(fields)
model = self.model
model.create_recognizer_schema(model.cursor)
test_trie = {}
- self.recognizer.insert_node(label='the synonym', label_id=1, entity_id=10, subtrie=test_trie, specs=specs, columns=['', '', '', ''], model=model)
- self.recognizer.insert_node('the synthesis', 2, 20, test_trie, specs, ['', '', '', ''], model)
+ self.utility.insert_node(label='the synonym', label_id=1, entity_id=10, subtrie=test_trie, specs=specs, columns=['', '', '', ''], model=model)
+ self.utility.insert_node('the synthesis', 2, 20, test_trie, specs, ['', '', '', ''], model)
expected = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'o': {'n': {'y': {'m': {'~i': [1]}}}}, 't': {'h': {'e': {'s': {'i': {'s': {'~i': [2]}}}}}}}}}}}}}}
assert test_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(test_trie))
def test_remove_node(self):
test_trie = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'o': {'n': {'y': {'m': {'~i': [1]}}}}, 't': {'h': {'e': {'s': {'i': {'s': {'~i': [1]}}}}}}}}}}}}}}
model = self.model
- self.recognizer.remove_node(model=model, label='the synonym', subtrie=test_trie)
+ self.utility.remove_node(model=model, label='the synonym', subtrie=test_trie)
expected = {'t': {'h': {'e': {' ': {'s': {'y': {'n': {'t': {'h': {'e': {'s': {'i': {'s': {'~i': [1]}}}}}}}}}}}}}}
assert test_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(test_trie))
@@ -101,9 +101,9 @@ def test_make_recognizer(self):
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
- specs = self.recognizer.compile_dict_specs(fields)
+ specs = self.utility.compile_dict_specs(fields)
model = self.model
- got_recognizer, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
+ got_recognizer, got_line_numbers = self.utility.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected_recognizer = [
{
model.SPECS_KEY: {
@@ -146,10 +146,10 @@ def test_make_keywords(self):
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
- specs = self.recognizer.compile_dict_specs(fields)
+ specs = self.utility.compile_dict_specs(fields)
model = self.model
- _, got_line_numbers = self.recognizer.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
- keywords = self.recognizer.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', column_enclosure='', tokenizer_option=0)
+ _, got_line_numbers = self.utility.make_recognizer(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', item_limit=0, compressed=True, column_separator='\t', column_enclosure='', tokenizer_option=0)
+ keywords = self.utility.make_keywords(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, line_numbers=got_line_numbers, word_separator=' ', disambiguate_all=False, column_separator='\t', column_enclosure='', tokenizer_option=0)
expected = {
model.CONTENT_KEY: {
0: {'it', 'refrigeratorx', 'white', 'awesome', 'refrigerator', 'conflicting', 'refrigerators'},
@@ -171,7 +171,7 @@ def test_make_keywords(self):
def test_compile_model(self):
compiled, model = self.compile_test_model()
- assert compiled == True, 'pilsner.Recognizer.compile_model() returned False which is not expected'
+ assert compiled == True, 'pilsner.Utility.compile_model() returned False which is not expected'
assert model.NORMALIZER_KEY in model, 'Model does not have model.NORMALIZER_KEY which is not expected'
assert model.DEFAULT_NORMALIZER_KEY in model, 'Model does not have model.DEFAULT_NORMALIZER_KEY which is not expected'
assert model.DICTIONARY_KEY in model, 'Model does not have model.DICTIONARY_KEY which is not expected'
@@ -208,7 +208,7 @@ def test_unpack_trie(self):
_, model = self.compile_test_model()
packed_trie = {'wesome white refrigera': {' ': {'tors': {model.ENTITY_KEY: [0]}}, 't': {'or': {'x': {model.ENTITY_KEY: [1]}, model.ENTITY_KEY: [4]}}}}
expected = {'w': {'e': {'s': {'o': {'m': {'e': {' ': {'w': {'h': {'i': {'t': {'e': {' ': {'r': {'e': {'f': {'r': {'i': {'g': {'e': {'r': {'a': {' ': {'tors': {'~i': [0]}}, 't': {'or': {'x': {'~i': [1]}, '~i': [4]}}}}}}}}}}}}}}}}}}}}}}}}}
- unpacked_trie = self.recognizer.unpack_trie(model=model, packed_trie=packed_trie, compressed=True)
+ unpacked_trie = self.utility.unpack_trie(model=model, packed_trie=packed_trie, compressed=True)
assert unpacked_trie == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(unpacked_trie))
def test_unpack_attributes(self):
@@ -220,7 +220,7 @@ def test_unpack_attributes(self):
process_exclude = False
attrs_out_query = ''
expected = {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}}
- attributes = self.recognizer.unpack_attributes(cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query)
+ attributes = self.utility.unpack_attributes(cur, leaf_ids, include_query, exclude_query, process_exclude, attrs_out_query)
assert attributes == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(attributes))
def test_check_attrs(self):
@@ -232,7 +232,7 @@ def test_check_attrs(self):
process_exclude = False
attrs_out_query = ''
expected = {model.ENTITY_KEY: [8], model.ATTRS_KEY: {8: {'entity_id': ['entity1'], 'normalizer': ['tokenizer2'], 'some_attribute': ['A', 'B', 'C']}}}
- got_leaf = self.recognizer.check_attrs(model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query)
+ got_leaf = self.utility.check_attrs(model, trie_leaf, cur, include_query, exclude_query, process_exclude, attrs_out_query)
assert got_leaf == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(got_leaf))
def test_spot_entities(self):
@@ -240,7 +240,7 @@ def test_spot_entities(self):
source_string = 'this is awesome white refrigerator , and this is not'
normalizer_name = 't1'
expected = [([4], {4: {'entity_id': ['entity1'], 'normalizer': ['tokenizer1'], 'some_attribute': ['A', 'B', 'C']}}, 'awesome white refrigerator', 8, 34)]
- spotted = self.recognizer.spot_entities(model, source_string, normalizer_name)
+ spotted = self.utility.spot_entities(model, source_string, normalizer_name)
assert spotted == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(spotted))
def test_disambiguate(self):
@@ -283,7 +283,7 @@ def test_disambiguate(self):
]
)
]
- disambiguated = self.recognizer.disambiguate(model, spotted, srcs, word_separator)
+ disambiguated = self.utility.disambiguate(model, spotted, srcs, word_separator)
assert disambiguated == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(disambiguated))
def test_flatten_layers(self):
@@ -345,7 +345,7 @@ def test_flatten_layers(self):
46, 71
)
]
- flattened = self.recognizer.flatten_layers(model, layers)
+ flattened = self.utility.flatten_layers(model, layers)
assert flattened == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(flattened))
def test_flatten_spans(self):
@@ -371,13 +371,13 @@ def test_flatten_spans(self):
(40, 42): {'DType': {'tokenizer2'}, 'MSID': {'entity1'}, 'smth': {'C', 'B', 'A'}},
(46, 71): {'DType': {'tokenizer1'}, 'MSID': {'entity1'}, 'smth': {'C', 'B', 'A'}}
}
- flattened = self.recognizer.flatten_spans(spans)
+ flattened = self.utility.flatten_spans(spans)
assert flattened == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(flattened))
def test_reduce_spans(self):
segments = set([tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])])
expected = [tuple([1, 6])]
- reduced = self.recognizer.reduce_spans(segments)
+ reduced = self.utility.reduce_spans(segments)
assert reduced == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(reduced))
def test_parse(self):
@@ -389,7 +389,7 @@ def test_parse(self):
(54, 56): {'entity_id': {'entity2'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'C', 'B', 'A'}},
(66, 90): {'entity_id': {'entity2'}, 'normalizer': {'tokenizer2'}, 'some_attribute': {'D', 'E'}}
}
- output = self.recognizer.parse(model, source_string)
+ output = self.utility.parse(model, source_string)
assert output == expected, '\nExpected\n%s\nGot\n%s' % (str(expected), str(output))
if __name__ == '__main__':
From 302b60e046bf4a00173dab6a70a99da8aad1bb7c Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 4 Oct 2020 22:10:43 -0400
Subject: [PATCH 098/116] renamed Recognizer to Utility (readme)
---
README.md | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/README.md b/README.md
index 6c4e82f..0644ce7 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ The only dependency is `sic` package. While it can be automatically installed at
## 3. Diagram
-`pilsner` consists of two major components: `Model` and `Recognizer`. `Model` class provides storage for the dictionary and string normalization rules, as well as methods for populating this storage. `Recognizer` class provides methods for accessing `Model`.
+`pilsner` consists of two major components: `Model` and `Utility`. `Model` class provides storage for the dictionary and string normalization rules, as well as methods for populating this storage. `Utility` class provides methods for accessing `Model`.
![Diagram](misc/pilsner-diagram.svg)
@@ -43,7 +43,7 @@ m.normalizer_map = {
'animal': 'default',
'plant': 'custom'
}
-r = pilsner.Recognizer()
+r = pilsner.Utility()
fields = [
{
'name': 'type',
@@ -154,7 +154,7 @@ m.add_normalizer(
### 4.3. Initialize utility
```python
-r = pilsner.Recognizer()
+r = pilsner.Utility()
```
### 4.4. Add dictionary
From b65cd5883812f56422def95cb98635111f78fe6f Mon Sep 17 00:00:00 2001
From: pgolo
Date: Sun, 4 Oct 2020 22:27:11 -0400
Subject: [PATCH 099/116] readme (normalizer_map)
---
README.md | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/README.md b/README.md
index 0644ce7..6114a68 100644
--- a/README.md
+++ b/README.md
@@ -150,6 +150,17 @@ m.add_normalizer(
- Model can embed more than one normalization unit.
- Default normalization unit for the model is the one added first or the last one added with parameter `default` set to `True`.
+- Having multiple normalization units in one model makes perfect sense when the stored dictionary contains synonyms of different nature that should be normalized in different ways (for example, abbreviations probably should not get normalized at all, while other synonyms might include tokens or punctuation marks that should not affect entity recognition). For that purpose, the Model class includes the `normalizer_map` dict that maps names of added normalization units to values of a specific field in the dictionary (the tokenizer field, or tokenizer column) that designates how a synonym should be normalized:
+
+```python
+# Assuming m is pilsner.Model instance:
+m.normalizer_map = {
+ 'synonym_type_1': 'normalizer_1',
+ 'synonym_type_2': 'normalizer_2'
+}
+```
+
+> The snippet above instructs `pilsner` to normalize synonyms that have `synonym_type_1` value in `tokenizer` column with `normalizer_1` normalization unit, and normalize synonyms that have `synonym_type_2` value in `tokenizer` column with `normalizer_2` normalization unit. For more about fields in the dictionary, see [4.4. Add dictionary](#44-add-dictionary).
### 4.3. Initialize utility
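
As a concrete illustration of the mapping described in this patch, here is a minimal sketch; the unit names, file names, and column values are hypothetical, not taken from the repository.

```python
import pilsner

m = pilsner.Model()
# Two hypothetical normalization units: one that keeps strings intact
# and one that normalizes aggressively.
m.add_normalizer('exact', 'exact_normalizer.xml')
m.add_normalizer('loose', 'loose_normalizer.xml')

# Rows whose tokenizer column reads 'abbreviation' keep their exact form;
# rows whose tokenizer column reads 'synonym' go through the loose unit.
m.normalizer_map = {
    'abbreviation': 'exact',
    'synonym': 'loose'
}
```
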
From df46f2ff86e0dd023c7b6a426330b3f871638086 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Mon, 5 Oct 2020 21:50:01 -0400
Subject: [PATCH 100/116] only use specs structure internally
---
misc/example/example.py | 5 +----
pilsner/utility.pxd | 3 ++-
pilsner/utility.py | 19 ++++++++++++++++---
test/performance.py | 6 ++----
test/sandbox.py | 2 +-
test/ut_utility.py | 3 +--
6 files changed, 23 insertions(+), 15 deletions(-)
diff --git a/misc/example/example.py b/misc/example/example.py
index e512954..478392d 100644
--- a/misc/example/example.py
+++ b/misc/example/example.py
@@ -58,14 +58,11 @@
}
]
-# Convert table definition into internally used `specs` structure
-specs = r.compile_dict_specs(fields)
-
# Populate Model instance with data from misc/example/living_things.txt file
r.compile_model(
model=m,
filename='misc/example/living_things.txt',
- specs=specs,
+ fields=fields,
word_separator=' ',
column_separator='\t',
column_enclosure='\n',
diff --git a/pilsner/utility.pxd b/pilsner/utility.pxd
index e27f7fe..ea5ba75 100644
--- a/pilsner/utility.pxd
+++ b/pilsner/utility.pxd
@@ -118,6 +118,7 @@ cdef class Utility():
)
@cython.locals(
+ specs=cython.dict,
tries=cython.list,
line_numbers=cython.dict,
keywords=cython.dict
@@ -126,7 +127,7 @@ cdef class Utility():
self,
model,
str filename,
- dict specs,
+ list fields,
str word_separator,
str column_separator,
str column_enclosure,
diff --git a/pilsner/utility.py b/pilsner/utility.py
index dddd41f..6a35b67 100644
--- a/pilsner/utility.py
+++ b/pilsner/utility.py
@@ -239,13 +239,13 @@ def make_keywords(self, model, filename, specs, line_numbers, word_separator, di
self.logger('Done compiling keywords.')
return keywords
- def compile_model(self, model, filename, specs, word_separator, column_separator, column_enclosure, compressed=True, item_limit=0, tokenizer_option=0, include_keywords=False, disambiguate_all=False):
+ def compile_model(self, model, filename, fields, word_separator, column_separator, column_enclosure, compressed=True, item_limit=0, tokenizer_option=0, include_keywords=False, disambiguate_all=False):
"""Populates given Model instance with tries and keywords.
Args:
Model *model*: Model instance to populate
str *filename*: path and name of tab-delimited text file with the content
- dict *specs*: specifications for columns in the text file
+ list *fields*: list of dict objects defining the columns in the text file
str *word_separator*: string considered to be the word delimiter
str *column_separator*: delimiter to split columns
str *column_enclosure*: any string that columns are supposed to be trimmed of
@@ -254,7 +254,20 @@ def compile_model(self, model, filename, specs, word_separator, column_separator
int *tokenizer_option*: tokenizer mode (see documentation for normalization for details)
bool *include_keywords*: whether generate keywords at all or not
bool *disambiguate_all*: whether generate keywords for all entities or only for those having conflicting synonyms
+
+ Data structure for *fields* argument (also see compile_dict_specs() function):
+ [
+ {
+ 'name': 'str name of attribute',
+ 'include': bool True for including this column else False,
+ 'delimiter': 'str delimiter in case column stores concatenated lists',
+ 'id_flag': bool True if column stores entity ID else False,
+ 'normalizer_flag': bool True if column stores string normalizer tag else False,
+ 'value_flag': bool True if column stores string label to recognize else False
+ }
+ ]
"""
+ specs = self.compile_dict_specs(fields)
tries, line_numbers = self.make_recognizer(model, filename, specs, word_separator, item_limit, compressed, column_separator, column_enclosure, tokenizer_option)
keywords = {model.CONTENT_KEY: {}, model.INTERNAL_ID_KEY: {}}
if include_keywords:
@@ -356,7 +369,7 @@ def spot_entities(self, model, source_string, normalizer_name, include_query='',
int *progress_from*: initial progress value to report
int *progress_to*: maximum progress value to report
- Data structure for returbed value:
+ Data structure for returned value:
[
(
[int internal_ids],
diff --git a/test/performance.py b/test/performance.py
index bb3609f..635c5b7 100644
--- a/test/performance.py
+++ b/test/performance.py
@@ -147,10 +147,9 @@ def perf_compile_model_save_model(modules_to_test):
{'name': 'label_attr', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
{'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
-specs = utility.compile_dict_specs(fields)
""" % (x),
stmt="""
-utility.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+utility.compile_model(model, '.test-dict.txt', fields, ' ', '\\t', '\\n', include_keywords=True)
model.destroy()
""",
number=n
@@ -173,8 +172,7 @@ def perf_compile_model_save_model(modules_to_test):
{'name': 'label_attr', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False},
{'name': 'entity_attr', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
-specs = utility.compile_dict_specs(fields)
-utility.compile_model(model, '.test-dict.txt', specs, ' ', '\\t', '\\n', include_keywords=True)
+utility.compile_model(model, '.test-dict.txt', fields, ' ', '\\t', '\\n', include_keywords=True)
""" % (x),
stmt="""
model.save('.test-model')
diff --git a/test/sandbox.py b/test/sandbox.py
index 75a4a49..4b70774 100644
--- a/test/sandbox.py
+++ b/test/sandbox.py
@@ -28,7 +28,7 @@ def save_it():
]
specs = r.compile_dict_specs(fields)
_messages.clear()
- r.compile_model(m, 'test/assets/sample_dictionary.txt', specs, ' ', '\t', '\n', item_limit=3, include_keywords=True)
+ r.compile_model(m, 'test/assets/sample_dictionary.txt', fields, ' ', '\t', '\n', item_limit=3, include_keywords=True)
print(m['~keywords'])
#s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
s = 'this is awwsome white refrigerator , and it is awesome white refrigerator'
diff --git a/test/ut_utility.py b/test/ut_utility.py
index afefafc..229e09d 100644
--- a/test/ut_utility.py
+++ b/test/ut_utility.py
@@ -19,7 +19,6 @@ def compile_test_model(self):
{'name': 'label', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
{'name': 'some_attribute', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]
- specs = self.utility.compile_dict_specs(fields)
model = self.model
model.add_normalizer('t1', 'test/assets/tokenizer1.xml')
model.add_normalizer('t2', 'test/assets/tokenizer2.xml')
@@ -27,7 +26,7 @@ def compile_test_model(self):
'tokenizer1': 't1',
'tokenizer2': 't2'
}
- compiled = self.utility.compile_model(model=model, filename='test/assets/sample_dictionary.txt', specs=specs, word_separator=' ', column_separator='\t', column_enclosure='', include_keywords=True)
+ compiled = self.utility.compile_model(model=model, filename='test/assets/sample_dictionary.txt', fields=fields, word_separator=' ', column_separator='\t', column_enclosure='', include_keywords=True)
return compiled, model
def test_init(self):
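
With this patch, callers pass `fields` directly and `compile_dict_specs` runs internally. A minimal sketch of the new call shape follows; the dictionary path and column names below are illustrative.

```python
import pilsner

m = pilsner.Model()
r = pilsner.Utility()

# Hypothetical three-column tab-delimited dictionary:
# entity ID, synonym to look up, comma-separated attribute list
fields = [
    {'name': 'entity_id', 'include': True, 'delimiter': None,
     'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
    {'name': 'label', 'include': True, 'delimiter': None,
     'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
    {'name': 'attr', 'include': True, 'delimiter': ',',
     'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
]

# specs are now compiled from fields inside compile_model()
r.compile_model(m, 'dictionary.txt', fields, ' ', '\t', '\n')
```
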
From c539b46485f0d29e8a8055327de451c8912c32b8 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 6 Oct 2020 01:05:00 -0400
Subject: [PATCH 101/116] readme (completed)
---
README.md | 161 +++++++++++++++++++++++++++---------------------------
1 file changed, 82 insertions(+), 79 deletions(-)
diff --git a/README.md b/README.md
index 6114a68..b1e0504 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@ The only dependency is `sic` package. While it can be automatically installed at
## 3. Diagram
-`pilsner` consists of two major components: `Model` and `Utility`. `Model` class provides storage for the dictionary and string normalization rules, as well as methods for populating this storage. `Utility` class provides methods for accessing `Model`.
+`pilsner` consists of two major components: `Model` and `Utility`. `Model` class provides storage for the dictionary and string normalization rules, as well as low-level methods for populating this storage. `Utility` class provides high-level methods for storing and retrieving data to/from `Model` instance.
![Diagram](misc/pilsner-diagram.svg)
@@ -33,74 +33,6 @@ The only dependency is `sic` package. While it can be automatically installed at
import pilsner
```
-```python
-import pilsner
-
-m = pilsner.Model()
-m.add_normalizer('default', 'default_normalizer.xml')
-m.add_normalizer('custom', 'custom_normalizer.xml')
-m.normalizer_map = {
- 'animal': 'default',
- 'plant': 'custom'
-}
-r = pilsner.Utility()
-fields = [
- {
- 'name': 'type',
- 'include': True,
- 'delimiter': None,
- 'id_flag': False,
- 'normalizer_flag': True,
- 'value_flag': False
- },
- {
- 'name': 'id',
- 'include': True,
- 'delimiter': None,
- 'id_flag': True,
- 'normalizer_flag': False,
- 'value_flag': False
- },
- {
- 'name': 'label',
- 'include': True,
- 'delimiter': None,
- 'id_flag': False,
- 'normalizer_flag': False,
- 'value_flag': True
- },
- {
- 'name': 'habitat',
- 'include': True,
- 'delimiter': ',',
- 'id_flag': False,
- 'normalizer_flag': False,
- 'value_flag': False
- }
-]
-specs = r.compile_dict_specs(fields)
-r.compile_model(
- model=m,
- filename='living_things.txt',
- specs=specs,
- word_separator=' ',
- column_separator='\t',
- column_enclosure='\n',
- include_keywords=True
-)
-m.save('living_things')
-m = pilsner.Model('living_things')
-text_to_parse = 'sample text here'
-parsed = r.parse(
- model=m,
- source_string=text_to_parse,
- attrs_where={
- '+': {'habitat': {'air', 'ocean'}}
- },
- attrs_out=['id', 'type', 'habitat']
-)
-```
-
### 4.1. Initialize model
- To initialize empty model:
@@ -133,8 +65,7 @@ m = pilsner.Model(filename='path/to/model')
### 4.2. Add string normalization units
-Depending on the dictionary and nature of the text supposed to be parsed, string normalization might not be required at all, and nothing specific is to be done in such case.
-
+- Depending on the dictionary and the nature of the text to be parsed, string normalization might not be required at all, in which case nothing specific needs to be done here.
- Without string normalization, synonyms from the dictionary will be stored as they are and looked up by recognizer case-sensitively.
- To add a single normalization unit:
@@ -160,34 +91,106 @@ m.normalizer_map = {
}
```
-> The snippet above instructs `pilsner` to normalize synonyms that have `synonym_type_1` value in `tokenizer` column with `normalizer_1` normalization unit, and normalize synonyms that have `synonym_type_2` value in `tokenizer` column with `normalizer_2` normalization unit. For more about fields in the dictionary, see [4.4. Add dictionary](#44-add-dictionary).
+> The snippet above instructs `pilsner` to normalize synonyms that have `synonym_type_1` value in `tokenizer` column with `normalizer_1` normalization unit, and normalize synonyms that have `synonym_type_2` value in `tokenizer` column with `normalizer_2` normalization unit. For more about fields in the dictionary, see [4.4. Define dictionary](#44-define-dictionary).
### 4.3. Initialize utility
+- To load a dictionary into a `Model` instance, as well as to parse text, a `Utility` instance is required:
+
```python
r = pilsner.Utility()
```
-### 4.4. Add dictionary
+### 4.4. Define dictionary
-Blah
+- Source dictionary for `pilsner` must be a delimited text file.
+- Along with the source dictionary, specifications of the columns (fields) must be provided as a list where each item corresponds to a column (from left to right). Each item in this list must be a dict object with string keys `name`, `include`, `delimiter`, `id_flag`, `normalizer_flag`, and `value_flag`, so that:
+ - `field['name']` is a string for the column title;
+ - `field['include']` is a boolean that must be set to `True` for the column to be included in the model, otherwise `False`;
+ - `field['delimiter']` is a string that is supposed to split a single cell into a list of values if the column holds concatenated lists rather than individual values;
+ - `field['id_flag']` is a boolean that must be set to `True` if the column is supposed to be used for grouping synonyms (generally, the entity ID is such a column), otherwise `False`;
+ - `field['normalizer_flag']` is a boolean that must be set to `True` if the column indicates which normalization unit must be applied to this particular synonym, otherwise `False`;
+ - `field['value_flag']` is a boolean that must be set to `True` if the column holds synonyms that are supposed to be looked up when parsing a text, otherwise `False`.
+
+> If the dictionary has a column flagged with `normalizer_flag`, the synonym in each row will be normalized with the string normalization unit whose name is mapped to the value in this column via the `pilsner.Model.normalizer_map` dict. If the value is not among the `pilsner.Model.normalizer_map` keys, the default normalization unit will be used.
### 4.5. Compile model
-Blah
+- To store a dictionary in a `Model` instance, the `compile_model` method of a `Utility` instance must be called with the following required parameters:
+ - `model`: pointer to an initialized `Model` instance;
+ - `filename`: string with path and filename of the source dictionary;
+ - `fields`: list of dict objects with definitions of columns (see [4.4. Define dictionary](#44-define-dictionary));
+ - `word_separator`: string defining what is to be considered the word separator (generally, it should be whitespace);
+ - `column_separator`: string defining what is to be considered the column separator (e.g. `\t` for a tab-delimited file);
+ - `column_enclosure`: string defining what is to be stripped away from each cell after the row has been split into columns (typically, it should be `\n` so the newline character is trimmed from the rightmost column).
+
+```python
+# Assuming m is pilsner.Model instance and r is pilsner.Utility instance:
+r.compile_model(
+ model=m,
+ filename='path/to/dictionary_in_a_text_file.txt',
+ fields=fields,
+ word_separator=' ',
+ column_separator='\t',
+ column_enclosure='\n'
+)
+```
+
+- To review optional parameters, see comments in the code.
### 4.6. Save model
-Blah
+- If a `Model` instance holds a compiled dictionary, and its database location is not explicitly set to `':memory:'`, the data it holds can be saved to disk:
+
+```python
+# Assuming m is pilsner.Model instance
+m.save('path/to/model_name')
+```
+
+- The snippet above will write the following files:
+ - `path/to/model_name.attributes`: database with attributes (fields from the dictionary that are not synonyms);
+ - `path/to/model_name.keywords`: keywords used for disambiguation;
+ - `path/to/model_name.normalizers`: string normalization units;
+ - `path/to/model_name.0.dictionary`: trie with synonyms;
+ - `path/to/model_name.<n>.dictionary`: additional tries with synonyms (`<n>` being the integer number of a trie) in case more than one trie was created (see comments in the code: `pilsner.Utility.compile_model` method, `item_limit` parameter).
### 4.7. Load model
-Blah
+- To initialize a new `Model` instance using previously saved data:
+
+```python
+m = pilsner.Model(filename='path/to/model_name')
+```
+
+- Alternatively, data can be loaded into a previously initialized `Model` instance:
+
+```python
+m = pilsner.Model()
+m.load('path/to/model_name')
+```
+
+- In both cases, the program will look for the following files:
+ - `path/to/model_name.attributes`: database with attributes (fields from the dictionary that are not synonyms);
+ - `path/to/model_name.keywords`: keywords used for disambiguation;
+ - `path/to/model_name.normalizers`: string normalization units;
+ - `path/to/model_name.<n>.dictionary`: tries with synonyms (`<n>` being an integer).
### 4.8. Parse string
-Blah
+- To parse a string without filtering out any synonyms and output all attributes of spotted entities:
+
+```python
+# Assuming m is pilsner.Model instance, r is pilsner.Utility instance, and text_to_parse is string to parse
+parsed = r.parse(
+ model=m,
+ source_string=text_to_parse
+)
+```
+
+- The output will be a dict object where keys are tuples (begin, end) for the location of a spotted entity in the string, and values are dicts of attributes associated with the identified entity (`{'attribute_name': {attribute_values}}`).
+- For details about optional parameters, see comments in the code (`pilsner.Utility.parse` function).
+
## 5. Example
-Blah
+Everything written above is put together in example code; see the **/misc/example/** directory in the project's repository.
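
To make the output shape described in section 4.8 concrete, a hypothetical run over a string with one recognized entity might look like this; the offsets, attribute names, and values are illustrative only.

```python
# Assuming m is pilsner.Model instance and r is pilsner.Utility instance
parsed = r.parse(model=m, source_string='spotted a big eagle today')
# Hypothetical result shape:
# {(10, 19): {'id': {'2'}, 'type': {'animal'}, 'habitat': {'air'}}}
for (begin, end), attrs in parsed.items():
    print(begin, end, attrs)
```
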
From 968e95e95f7fa32289587d4458137d72e770d01e Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 6 Oct 2020 01:19:16 -0400
Subject: [PATCH 102/116] readme (hard line wrap)
---
README.md | 138 ++++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 102 insertions(+), 36 deletions(-)
diff --git a/README.md b/README.md
index b1e0504..244d7ad 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,9 @@ Python implemented library servicing named entity recognition
## 1. Purpose
-This library is Python implementation of toolkit for dictionary based named entity recognition. It is intended to store any thesaurus in a trie-like structure and identify any of stored synonyms in a string.
+This library is a Python implementation of a toolkit for dictionary-based named
+entity recognition. It is intended to store any thesaurus in a trie-like
+structure and identify any of the stored synonyms in a string.
## 2. Installation and dependencies
@@ -19,11 +21,18 @@ pip install pilsner
`pilsner` is tested in Python 3.6, 3.7, and 3.8.
-The only dependency is `sic` package. While it can be automatically installed at the time of `pilsner` installation, manual installation of `sic` beforehand might also be considered (see benchmark of cythonized vs pure Python implementation in `sic` docimentation, [https://pypi.org/project/sic/](https://pypi.org/project/sic/)).
+The only dependency is the `sic` package. While it can be automatically
+installed at the time of `pilsner` installation, manual installation of `sic`
+beforehand might also be considered (see the benchmark of the cythonized vs
+pure Python implementation in the `sic` documentation,
+[https://pypi.org/project/sic/](https://pypi.org/project/sic/)).
## 3. Diagram
-`pilsner` consists of two major components: `Model` and `Utility`. `Model` class provides storage for the dictionary and string normalization rules, as well as low-level methods for populating this storage. `Utility` class provides high-level methods for storing and retrieving data to/from `Model` instance.
+`pilsner` consists of two major components: `Model` and `Utility`. `Model`
+class provides storage for the dictionary and string normalization rules, as
+well as low-level methods for populating this storage. `Utility` class provides
+high-level methods for storing and retrieving data to/from `Model` instance.
![Diagram](misc/pilsner-diagram.svg)
@@ -47,13 +56,15 @@ m = pilsner.Model()
m = pilsner.Model(storage_location='path/to/database.file')
```
-- To create empty model that uses database created in memory rather than on disk:
+- To create an empty model that uses a database created in memory rather than
+on disk:
```python
m = pilsner.Model(storage_location=':memory:')
```
-> If database is created in memory, the model cannot be later saved on disk (can only be used instantly).
+> If the database is created in memory, the model cannot later be saved to disk
+(it can only be used instantly).
- To load model from disk:
@@ -61,12 +72,16 @@ m = pilsner.Model(storage_location=':memory:')
m = pilsner.Model(filename='path/to/model')
```
-> More on how model is saved to and loaded from disk - see [4.6. Save model](#46-save-model) and [4.7. Load model](#47-load-model).
+> For more on how the model is saved to and loaded from disk, see
+[4.6. Save model](#46-save-model) and [4.7. Load model](#47-load-model).
### 4.2. Add string normalization units
-- Depending on the dictionary and the nature of the text to be parsed, string normalization might not be required at all, in which case nothing specific needs to be done here.
-- Without string normalization, synonyms from the dictionary will be stored as they are and looked up by recognizer case-sensitively.
+- Depending on the dictionary and the nature of the text to be parsed, string
+normalization might not be required at all, in which case nothing specific
+needs to be done here.
+- Without string normalization, synonyms from the dictionary will be stored as
+they are and looked up by recognizer case-sensitively.
- To add a single normalization unit:
```python
@@ -77,11 +92,22 @@ m.add_normalizer(
)
```
-> String normalization is technically done by `sic` component. See documentation for `sic` at [https://pypi.org/project/sic/](https://pypi.org/project/sic/) to learn how to design normalizer config.
+> String normalization is technically done by the `sic` component. See the
+> documentation for `sic` at
+> [https://pypi.org/project/sic/](https://pypi.org/project/sic/) to learn how
+> to design a normalizer config.
- Model can embed more than one normalization unit.
-- Default normalization unit for the model is the one added first or the last one added with parameter `default` set to `True`.
-- Having multiple normalization units in one model makes perfect sense when the stored dictionary contains synonyms of different nature that should be normalized in different ways (for example, abbreviations probably should not get normalized at all, while other synonyms might include tokens or punctuation marks that should not affect entity recognition). For that purpose, the Model class includes the `normalizer_map` dict that maps names of added normalization units to values of a specific field in the dictionary (the tokenizer field, or tokenizer column) that designates how a synonym should be normalized:
+- Default normalization unit for the model is the one added first or the last
+one added with parameter `default` set to `True`.
+- Having multiple normalization units in one model makes perfect sense when the
+stored dictionary contains synonyms of different nature that should be
+normalized in different ways (for example, abbreviations probably should not
+get normalized at all, while other synonyms might include tokens or punctuation
+marks that should not affect entity recognition). For that purpose, the Model
+class includes the `normalizer_map` dict that maps names of added normalization
+units to values of a specific field in the dictionary (the tokenizer field, or
+tokenizer column) that designates how a synonym should be normalized:
```python
# Assuming m is pilsner.Model instance:
@@ -91,11 +117,16 @@ m.normalizer_map = {
}
```
-> The snippet above instructs `pilsner` to normalize synonyms that have `synonym_type_1` value in `tokenizer` column with `normalizer_1` normalization unit, and normalize synonyms that have `synonym_type_2` value in `tokenizer` column with `normalizer_2` normalization unit. For more about fields in the dictionary, see [4.4. Define dictionary](#44-define-dictionary).
+> The snippet above instructs `pilsner` to normalize synonyms that have
+> `synonym_type_1` value in `tokenizer` column with `normalizer_1`
+> normalization unit, and normalize synonyms that have `synonym_type_2` value
+> in `tokenizer` column with `normalizer_2` normalization unit. For more about
+> fields in a dictionary, see [4.4. Define dictionary](#44-define-dictionary).
### 4.3. Initialize utility
-- To load a dictionary into a `Model` instance, as well as to parse text, a `Utility` instance is required:
+- To load a dictionary into a `Model` instance, as well as to parse text, a
+`Utility` instance is required:
```python
r = pilsner.Utility()
@@ -104,25 +135,48 @@ r = pilsner.Utility()
### 4.4. Define dictionary
- Source dictionary for `pilsner` must be a delimited text file.
-- Along with the source dictionary, specifications of the columns (fields) must be provided as a list where each item corresponds to a column (from left to right). Each item in this list must be a dict object with string keys `name`, `include`, `delimiter`, `id_flag`, `normalizer_flag`, and `value_flag`, so that:
+- Along with the source dictionary, specifications of the columns (fields) must
+be provided as a list where each item corresponds to a column (from left to
+right). Each item in this list must be a dict object with string keys `name`,
+`include`, `delimiter`, `id_flag`, `normalizer_flag`, and `value_flag`, so
+that:
 - `field['name']` is a string for the column title;
- - `field['include']` is a boolean that must be set to `True` for the column to be included in the model, otherwise `False`;
- - `field['delimiter']` is a string that is supposed to split a single cell into a list of values if the column holds concatenated lists rather than individual values;
- - `field['id_flag']` is a boolean that must be set to `True` if the column is supposed to be used for grouping synonyms (generally, the entity ID is such a column), otherwise `False`;
- - `field['normalizer_flag']` is a boolean that must be set to `True` if the column indicates which normalization unit must be applied to this particular synonym, otherwise `False`;
- - `field['value_flag']` is a boolean that must be set to `True` if the column holds synonyms that are supposed to be looked up when parsing a text, otherwise `False`.
-
-> If the dictionary has a column flagged with `normalizer_flag`, the synonym in each row will be normalized with the string normalization unit whose name is mapped to the value in this column via the `pilsner.Model.normalizer_map` dict. If the value is not among the `pilsner.Model.normalizer_map` keys, the default normalization unit will be used.
+ - `field['include']` is a boolean that must be set to `True` for the column
+ to be included in the model, otherwise `False`;
+ - `field['delimiter']` is a string that is supposed to split a single cell
+ into a list of values if the column holds concatenated lists rather than
+ individual values;
+ - `field['id_flag']` is a boolean that must be set to `True` if the column is
+ supposed to be used for grouping synonyms (generally, the entity ID is such a
+ column), otherwise `False`;
+ - `field['normalizer_flag']` is a boolean that must be set to `True` if the
+ column indicates which normalization unit must be applied to this particular
+ synonym, otherwise `False`;
+ - `field['value_flag']` is a boolean that must be set to `True` if the column
+ holds synonyms that are supposed to be looked up when parsing a text,
+ otherwise `False`.
+
+> If the dictionary has a column flagged with `normalizer_flag`, the synonym in
+> each row will be normalized with the string normalization unit whose name is
+> mapped to the value in this column via the `pilsner.Model.normalizer_map`
+> dict. If the value is not among the `pilsner.Model.normalizer_map` keys, the
+> default normalization unit will be used.
### 4.5. Compile model
-- To store a dictionary in a `Model` instance, the `compile_model` method of a `Utility` instance must be called with the following required parameters:
+- To store a dictionary in a `Model` instance, the `compile_model` method of a
+`Utility` instance must be called with the following required parameters:
 - `model`: pointer to an initialized `Model` instance;
 - `filename`: string with path and filename of the source dictionary;
- - `fields`: list of dict objects with definitions of columns (see [4.4. Define dictionary](#44-define-dictionary));
- - `word_separator`: string defining what is to be considered the word separator (generally, it should be whitespace);
- - `column_separator`: string defining what is to be considered the column separator (e.g. `\t` for a tab-delimited file);
- - `column_enclosure`: string defining what is to be stripped away from each cell after the row has been split into columns (typically, it should be `\n` so the newline character is trimmed from the rightmost column).
+ - `fields`: list of dict objects with definitions of columns (see
+ [4.4. Define dictionary](#44-define-dictionary));
+ - `word_separator`: string defining what is to be considered the word
+ separator (generally, it should be whitespace);
+ - `column_separator`: string defining what is to be considered the column
+ separator (e.g. `\t` for a tab-delimited file);
+ - `column_enclosure`: string defining what is to be stripped away from each
+ cell after the row has been split into columns (typically, it should be `\n`
+ so the newline character is trimmed from the rightmost column).
```python
# Assuming m is pilsner.Model instance and r is pilsner.Utility instance:
@@ -140,7 +194,9 @@ r.compile_model(
### 4.6. Save model
-- If a `Model` instance holds a compiled dictionary, and its database location is not explicitly set to `':memory:'`, the data it holds can be saved to disk:
+- If a `Model` instance holds a compiled dictionary, and its database location
+is not explicitly set to `':memory:'`, the data it holds can be saved to
+disk:
```python
# Assuming m is pilsner.Model instance
@@ -148,11 +204,15 @@ m.save('path/to/model_name')
```
- The snippet above will write the following files:
- - `path/to/model_name.attributes`: database with attributes (fields from the dictionary that are not synonyms);
+ - `path/to/model_name.attributes`: database with attributes (fields from the
+ dictionary that are not synonyms);
- `path/to/model_name.keywords`: keywords used for disambiguation;
- `path/to/model_name.normalizers`: string normalization units;
- `path/to/model_name.0.dictionary`: trie with synonyms;
- - `path/to/model_name.<n>.dictionary`: additional tries with synonyms (`<n>` being the integer number of a trie) in case more than one trie was created (see comments in the code: `pilsner.Utility.compile_model` method, `item_limit` parameter).
+ - `path/to/model_name.<n>.dictionary`: additional tries with synonyms (`<n>`
+ being the integer number of a trie) in case more than one trie was created
+ (see comments in the code: `pilsner.Utility.compile_model` method,
+ `item_limit` parameter).
### 4.7. Load model
@@ -173,24 +233,30 @@ m.load('path/to/model_name')
- `path/to/model_name.attributes`: database with attributes (fields from the dictionary that are not synonyms);
- `path/to/model_name.keywords`: keywords used for disambiguation;
- `path/to/model_name.normalizers`: string normalization units;
- - `path/to/model_name.<n>.dictionary`: tries with synonyms (`<n>` being an integer).
+ - `path/to/model_name.<n>.dictionary`: tries with synonyms (`<n>` being an
+ integer).
### 4.8. Parse string
-- To parse a string without filtering out any synonyms and output all attributes of spotted entities:
+- To parse a string without filtering out any synonyms and output all
+attributes of spotted entities:
```python
-# Assuming m is pilsner.Model instance, r is pilsner.Utility instance, and text_to_parse is string to parse
+# Assuming m is pilsner.Model instance, r is pilsner.Utility instance,
+# and text_to_parse is string to parse
parsed = r.parse(
model=m,
source_string=text_to_parse
)
```
-- The output will be a dict object where keys are tuples (begin, end) for the location of a spotted entity in the string, and values are dicts of attributes associated with the identified entity (`{'attribute_name': {attribute_values}}`).
-- For details about optional parameters, see comments in the code (`pilsner.Utility.parse` function).
-
+- The output will be a dict object where keys are tuples (begin, end) for the
+location of a spotted entity in the string, and values are dicts of attributes
+associated with the identified entity (`{'attribute_name': {attribute_values}}`).
+- For details about optional parameters, see comments in the code
+(`pilsner.Utility.parse` function).
## 5. Example
-Everything written above is put together in example code; see the **/misc/example/** directory in the project's repository.
+Everything written above is put together in example code; see the
+**/misc/example/** directory in the project's repository.
From 87479547cef4f0b7a2287aed5372d2912062af28 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 6 Oct 2020 22:42:35 -0400
Subject: [PATCH 103/116] example
---
misc/example/custom_normalizer.xml | 5 +++++
misc/example/default_normalizer.xml | 6 ++++++
misc/example/example.py | 15 +++++++++------
misc/example/living_things.txt | 5 ++++-
4 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/misc/example/custom_normalizer.xml b/misc/example/custom_normalizer.xml
index e69de29..71a4048 100644
--- a/misc/example/custom_normalizer.xml
+++ b/misc/example/custom_normalizer.xml
@@ -0,0 +1,5 @@
+
+
+
+
+
diff --git a/misc/example/default_normalizer.xml b/misc/example/default_normalizer.xml
index e69de29..424ab57 100644
--- a/misc/example/default_normalizer.xml
+++ b/misc/example/default_normalizer.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/misc/example/example.py b/misc/example/example.py
index 478392d..7e9f0fa 100644
--- a/misc/example/example.py
+++ b/misc/example/example.py
@@ -10,8 +10,8 @@
m = pilsner.Model()
# Add normalization units
-#m.add_normalizer('default', 'misc/example/default_normalizer.xml')
-#m.add_normalizer('custom', 'misc/example/custom_normalizer.xml')
+m.add_normalizer('default', 'misc/example/default_normalizer.xml')
+m.add_normalizer('custom', 'misc/example/custom_normalizer.xml')
# Map names of normalization units to some string values
m.normalizer_map = {
@@ -76,15 +76,18 @@
m = pilsner.Model('misc/example/living_things')
# Parse string
-text_to_parse = 'sample text here a b c d c b a'
+text_to_parse = '''
+Little mouse is not recognized and is not frightened by big scary eagle.
+Daniorerio also does not care much about water lilies, though both are recognized.
+'''
parsed = r.parse(
model=m,
source_string=text_to_parse,
attrs_where={
- '+': {'habitat': {'air', 'ocean'}}
+ '+': {'habitat': {'air', 'ocean'}} # only consider items with these values in 'habitat' column
},
- attrs_out=['id', 'type', 'habitat']
+ attrs_out=['id', 'type', 'habitat'] # for each spotted entity, output 'id', 'type', and 'habitat' attributes
)
-# Print out the result
+# Print out the result: recognized are 'big eagle', 'danio rerio', 'water lily'.
print(parsed)
diff --git a/misc/example/living_things.txt b/misc/example/living_things.txt
index 593d58d..444832d 100644
--- a/misc/example/living_things.txt
+++ b/misc/example/living_things.txt
@@ -1 +1,4 @@
-a b c air
+animal 1 little mouse ground
+animal 2 big eagle air
+animal 3 danio rerio ocean
+plant 4 water lily ocean
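
Given the four dictionary rows above, the `attrs_where` include filter in the example keeps only entities whose `habitat` is `air` or `ocean`, which is why 'little mouse' (habitat `ground`) is not reported. A sketch contrasting the filtered and unfiltered calls, assuming `m`, `r`, and `text_to_parse` as set up in example.py:

```python
# Without the filter, 'little mouse' would also be surfaced.
unfiltered = r.parse(model=m, source_string=text_to_parse)

# With the '+' (include) filter, only air- and ocean-dwelling entities survive.
filtered = r.parse(
    model=m,
    source_string=text_to_parse,
    attrs_where={'+': {'habitat': {'air', 'ocean'}}},
    attrs_out=['type']
)
```
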
From 5e2453c962681c4313fde91890d757b054ce6ac8 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Tue, 6 Oct 2020 22:53:30 -0400
Subject: [PATCH 104/116] simplified example
---
misc/example/example.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/misc/example/example.py b/misc/example/example.py
index 7e9f0fa..c70ae0c 100644
--- a/misc/example/example.py
+++ b/misc/example/example.py
@@ -86,7 +86,7 @@
attrs_where={
'+': {'habitat': {'air', 'ocean'}} # only consider items with these values in 'habitat' column
},
- attrs_out=['id', 'type', 'habitat'] # for each spotted entity, output 'id', 'type', and 'habitat' attributes
+ attrs_out=['type'] # for each spotted entity, only output 'type' attribute
)
# Print out the result: recognized are 'big eagle', 'danio rerio', 'water lily'.
From 3b34104e74aff12be4ba08d6bfec38808a06fbae Mon Sep 17 00:00:00 2001
From: pgolo
Date: Wed, 7 Oct 2020 20:11:07 -0400
Subject: [PATCH 105/116] adjusted norm configs for example
---
misc/example/custom_normalizer.xml | 2 +-
misc/example/default_normalizer.xml | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/misc/example/custom_normalizer.xml b/misc/example/custom_normalizer.xml
index 71a4048..12b1911 100644
--- a/misc/example/custom_normalizer.xml
+++ b/misc/example/custom_normalizer.xml
@@ -1,5 +1,5 @@
-
+
diff --git a/misc/example/default_normalizer.xml b/misc/example/default_normalizer.xml
index 424ab57..709f5c7 100644
--- a/misc/example/default_normalizer.xml
+++ b/misc/example/default_normalizer.xml
@@ -1,5 +1,5 @@
-
+
From 8769c542441aba8ba7bdefdf6e5ab07d7fc6d712 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 01:16:25 -0400
Subject: [PATCH 106/116] updated gitignore
---
.gitignore | 1 +
1 file changed, 1 insertion(+)
diff --git a/.gitignore b/.gitignore
index cb8f20e..a814b03 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,5 +5,6 @@ cythonized/*
dist/*
bin/*
*.spec
+pilsner.egg-info/*
!**/.gitkeep
!**/.gitignore
From 49b4cf769a915c38b817fcf3657a03ef77a6243e Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 01:16:40 -0400
Subject: [PATCH 107/116] updated dependencies
---
requirements.txt | 2 ++
1 file changed, 2 insertions(+)
diff --git a/requirements.txt b/requirements.txt
index c8c5d07..da74093 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,3 @@
+Cython==0.29.19
+setuptools==47.1.1
sic>=1.0.4 --no-binary sic
From 1c3ed90086bbc9a6975b5b4010576e353a045e82 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 01:18:01 -0400
Subject: [PATCH 108/116] setup
---
MANIFEST.in | 2 ++
shipping/cythonize.py | 5 +++++
shipping/make_setup.py | 16 ++++++++++++++++
shipping/setup.py | 36 ++++++++++++++++++++++++++++++++++++
4 files changed, 59 insertions(+)
create mode 100644 MANIFEST.in
create mode 100644 shipping/cythonize.py
create mode 100644 shipping/make_setup.py
create mode 100644 shipping/setup.py
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..04f196a
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,2 @@
+include README.md
+include LICENSE
diff --git a/shipping/cythonize.py b/shipping/cythonize.py
new file mode 100644
index 0000000..660f7b6
--- /dev/null
+++ b/shipping/cythonize.py
@@ -0,0 +1,5 @@
+try:
+ from Cython.Build import cythonize
+ ext_modules = cythonize(['pilsner/model.py', 'pilsner/utility.py'], compiler_directives={'language_level': '3'})
+except:
+ pass
diff --git a/shipping/make_setup.py b/shipping/make_setup.py
new file mode 100644
index 0000000..cd33a97
--- /dev/null
+++ b/shipping/make_setup.py
@@ -0,0 +1,16 @@
+import sys
+
+def just_do_it(option):
+ cythonize = ''
+ if option in ['bdist_wheel']:
+ with open('shipping/cythonize.py', mode='r', encoding='utf8') as f:
+ cythonize = f.read()
+ with open('shipping/setup.py', mode='r', encoding='utf8') as i, open('./setup.py', mode='w', encoding='utf8') as o:
+ for line in i:
+ if line.strip() != '# pilsner: cythonize?':
+ o.write(line)
+ else:
+ o.write(cythonize)
+
+if __name__ == '__main__':
+ just_do_it(sys.argv[1] if len(sys.argv) > 1 else '')
diff --git a/shipping/setup.py b/shipping/setup.py
new file mode 100644
index 0000000..44683dd
--- /dev/null
+++ b/shipping/setup.py
@@ -0,0 +1,36 @@
+import sys
+from setuptools import setup
+
+ext_modules = None
+with open('README.md', mode='r', encoding='utf8') as f:
+ long_description = f.read()
+
+# pilsner: cythonize?
+
+setup(
+ name='pilsner',
+ version='X.X.X',
+ description='Utility for dictionary-based named entity recognition',
+ long_description=long_description,
+ long_description_content_type='text/markdown',
+ url='https://github.com/pgolo/pilsner',
+ author='Pavel Golovatenko-Abramov',
+ author_email='p.golovatenko@gmail.com',
+ packages=['pilsner'],
+ ext_modules=ext_modules,
+ include_package_data=True,
+ license='MIT',
+ platforms=['any'],
+ classifiers=[
+ 'Development Status :: 3 - Alpha',
+ 'Intended Audience :: Developers',
+ 'Topic :: Software Development :: Libraries :: Python Modules',
+ 'Topic :: Text Processing :: Linguistic',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'License :: OSI Approved :: MIT License',
+ 'Operating System :: OS Independent',
+ ],
+ python_requires='>=3.6'
+)
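
For clarity: when make_setup.py is invoked with `bdist_wheel`, the marker line in the generated ./setup.py is replaced by the contents of shipping/cythonize.py, so the top of the generated file effectively reads as follows (a sketch assembled from the two files above, not a separate file in the repository):

```python
import sys
from setuptools import setup

ext_modules = None
with open('README.md', mode='r', encoding='utf8') as f:
    long_description = f.read()

# spliced in from shipping/cythonize.py in place of '# pilsner: cythonize?':
try:
    from Cython.Build import cythonize
    ext_modules = cythonize(['pilsner/model.py', 'pilsner/utility.py'],
                            compiler_directives={'language_level': '3'})
except:
    pass
```
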
From cf55e6469d1a8c6896e54a9ecedb033dbfaf67e0 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 01:18:11 -0400
Subject: [PATCH 109/116] building scripts
---
scripts/linux/buildtargz.sh | 35 +++++++++++++++++++++++++++++++++++
scripts/linux/buildwheel.sh | 35 +++++++++++++++++++++++++++++++++++
scripts/win/buildtargz.bat | 23 +++++++++++++++++++++++
scripts/win/buildwheel.bat | 32 ++++++++++++++++++++++++++++++++
4 files changed, 125 insertions(+)
create mode 100644 scripts/linux/buildtargz.sh
create mode 100644 scripts/linux/buildwheel.sh
create mode 100644 scripts/win/buildtargz.bat
create mode 100644 scripts/win/buildwheel.bat
diff --git a/scripts/linux/buildtargz.sh b/scripts/linux/buildtargz.sh
new file mode 100644
index 0000000..eaa0ff1
--- /dev/null
+++ b/scripts/linux/buildtargz.sh
@@ -0,0 +1,35 @@
+# Usage:
+# buildtargz.sh path/to/python
+#
+# The package will be placed in dist/ directory.
+
+RUNDIR=`pwd`
+cd `dirname $0`
+MYDIR=`pwd`
+ROOT=${MYDIR}/../..
+REQUIREMENTS=${ROOT}/requirements.txt
+ENV=${ROOT}/.env.build
+SHIPPING=${ROOT}/shipping
+
+if [ $# -eq 0 ]
+then
+ cd ${RUNDIR}
+ exit
+fi
+
+if [ ! -f $1 ]
+then
+ echo $1: Python not found
+ cd ${RUNDIR}
+ exit
+fi
+
+cd ${ROOT}
+
+virtualenv -p $1 ${ENV}
+${ENV}/bin/python3 ${SHIPPING}/make_setup.py sdist
+${ENV}/bin/python3 ${ROOT}/setup.py sdist
+
+rm ${ROOT}/setup.py
+
+cd ${RUNDIR}
diff --git a/scripts/linux/buildwheel.sh b/scripts/linux/buildwheel.sh
new file mode 100644
index 0000000..2fc6803
--- /dev/null
+++ b/scripts/linux/buildwheel.sh
@@ -0,0 +1,35 @@
+# Usage:
+# buildwheel.sh path/to/python3.6 path/to/python3.7 path/to/python3.8
+#
+# The wheels will be placed in dist/ directory.
+
+RUNDIR=`pwd`
+cd `dirname $0`
+MYDIR=`pwd`
+ROOT=${MYDIR}/../..
+REQUIREMENTS=${ROOT}/requirements.txt
+ENV=${ROOT}/.env.build
+SHIPPING=${ROOT}/shipping
+
+cd ${ROOT}
+
+for PY in "$@"
+do
+ if [ ! -f ${PY} ]
+ then
+ echo ${PY}: Python not found
+ else
+ virtualenv -p ${PY} ${ENV}
+ ${ENV}/bin/python3 -m pip install --no-cache-dir -r ${REQUIREMENTS}
+ ${ENV}/bin/python3 ${SHIPPING}/make_setup.py bdist_wheel
+ ${ENV}/bin/python3 ${ROOT}/setup.py bdist_wheel
+ rm -r ${ENV}
+ rm -r ${ROOT}/pilsner.egg-info
+ rm -r ${ROOT}/build
+ rm ${ROOT}/pilsner/model.c
+ rm ${ROOT}/pilsner/utility.c
+ rm ${ROOT}/setup.py
+ fi
+done
+
+cd ${RUNDIR}
diff --git a/scripts/win/buildtargz.bat b/scripts/win/buildtargz.bat
new file mode 100644
index 0000000..76cc2ba
--- /dev/null
+++ b/scripts/win/buildtargz.bat
@@ -0,0 +1,23 @@
+@echo off
+rem Usage:
+rem buildtargz.bat path\to\python
+rem
+rem The package will be placed in dist\ directory.
+
+set RUNDIR=%cd%
+set MYDIR=%~dp0
+set ROOT=%MYDIR%\..\..
+set ENV=%ROOT%\.env.build
+set SHIPPING=%ROOT%\shipping
+
+if (%1)==() (cd %RUNDIR% && exit)
+if not exist "%1" (echo "%1": Python not found && cd %RUNDIR% && exit)
+cd "%ROOT%"
+virtualenv -p "%1" "%ENV%"
+"%ENV%"\Scripts\python "%SHIPPING%"\make_setup.py sdist
+"%ENV%"\Scripts\python "%ROOT%"\setup.py sdist
+rmdir /S /Q "%ENV%"
+
+del /Q "%ROOT%"\setup.py
+
+cd %RUNDIR%
diff --git a/scripts/win/buildwheel.bat b/scripts/win/buildwheel.bat
new file mode 100644
index 0000000..aaeaf4d
--- /dev/null
+++ b/scripts/win/buildwheel.bat
@@ -0,0 +1,32 @@
+@echo off
+rem Usage:
+rem buildwheel.bat path\to\python36 path\to\python37 path\to\python38
+rem
+rem The package will be placed in dist\ directory.
+
+set RUNDIR=%cd%
+set MYDIR=%~dp0
+set ROOT=%MYDIR%\..\..
+set REQUIREMENTS=%ROOT%\requirements.txt
+set ENV=%ROOT%\.env.build
+set SHIPPING=%ROOT%\shipping
+
+:BUILD
+if (%1)==() (goto FINISH)
+if not exist "%1" (echo "%1": Python not found && shift && goto BUILD)
+cd "%ROOT%"
+virtualenv -p "%1" "%ENV%"
+"%ENV%"\Scripts\python -m pip install --no-cache-dir -r "%REQUIREMENTS%"
+"%ENV%"\Scripts\python "%SHIPPING%"\make_setup.py bdist_wheel
+"%ENV%"\Scripts\python "%ROOT%"\setup.py bdist_wheel
+rmdir /S /Q "%ENV%"
+rmdir /S /Q "%ROOT%"\pilsner.egg-info
+rmdir /S /Q "%ROOT%"\build
+del /Q "%ROOT%"\pilsner\model.c
+del /Q "%ROOT%"\pilsner\utility.c
+shift
+goto BUILD
+
+:FINISH
+del /Q "%ROOT%"\setup.py
+cd %RUNDIR%
From 6e8353a2beac2de5c67b39cd6cb56207988c8364 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 01:39:58 -0400
Subject: [PATCH 110/116] updated reqs
---
requirements.txt => requirements-build.txt | 1 -
requirements-dev.txt | 1 +
scripts/linux/buildtargz.sh | 1 -
scripts/linux/buildwheel.sh | 2 +-
scripts/win/buildtargz.bat | 1 +
scripts/win/buildwheel.bat | 2 +-
shipping/setup.py | 3 ++-
7 files changed, 6 insertions(+), 5 deletions(-)
rename requirements.txt => requirements-build.txt (56%)
create mode 100644 requirements-dev.txt
diff --git a/requirements.txt b/requirements-build.txt
similarity index 56%
rename from requirements.txt
rename to requirements-build.txt
index da74093..f1fa460 100644
--- a/requirements.txt
+++ b/requirements-build.txt
@@ -1,3 +1,2 @@
Cython==0.29.19
setuptools==47.1.1
-sic>=1.0.4 --no-binary sic
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..c8c5d07
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1 @@
+sic>=1.0.4 --no-binary sic
diff --git a/scripts/linux/buildtargz.sh b/scripts/linux/buildtargz.sh
index eaa0ff1..1d5e4d7 100644
--- a/scripts/linux/buildtargz.sh
+++ b/scripts/linux/buildtargz.sh
@@ -7,7 +7,6 @@ RUNDIR=`pwd`
cd `dirname $0`
MYDIR=`pwd`
ROOT=${MYDIR}/../..
-REQUIREMENTS=${ROOT}/requirements.txt
ENV=${ROOT}/.env.build
SHIPPING=${ROOT}/shipping
diff --git a/scripts/linux/buildwheel.sh b/scripts/linux/buildwheel.sh
index 2fc6803..31f8f49 100644
--- a/scripts/linux/buildwheel.sh
+++ b/scripts/linux/buildwheel.sh
@@ -7,7 +7,7 @@ RUNDIR=`pwd`
cd `dirname $0`
MYDIR=`pwd`
ROOT=${MYDIR}/../..
-REQUIREMENTS=${ROOT}/requirements.txt
+REQUIREMENTS=${ROOT}/requirements-build.txt
ENV=${ROOT}/.env.build
SHIPPING=${ROOT}/shipping
diff --git a/scripts/win/buildtargz.bat b/scripts/win/buildtargz.bat
index 76cc2ba..8321369 100644
--- a/scripts/win/buildtargz.bat
+++ b/scripts/win/buildtargz.bat
@@ -17,6 +17,7 @@ virtualenv -p "%1" "%ENV%"
"%ENV%"\Scripts\python "%SHIPPING%"\make_setup.py sdist
"%ENV%"\Scripts\python "%ROOT%"\setup.py sdist
rmdir /S /Q "%ENV%"
+rmdir /S /Q "%ROOT%"\pilsner.egg-info
del /Q "%ROOT%"\setup.py
diff --git a/scripts/win/buildwheel.bat b/scripts/win/buildwheel.bat
index aaeaf4d..c2a0a70 100644
--- a/scripts/win/buildwheel.bat
+++ b/scripts/win/buildwheel.bat
@@ -7,7 +7,7 @@ rem The package will be placed in dist\ directory.
set RUNDIR=%cd%
set MYDIR=%~dp0
set ROOT=%MYDIR%\..\..
-set REQUIREMENTS=%ROOT%\requirements.txt
+set REQUIREMENTS=%ROOT%\requirements-build.txt
set ENV=%ROOT%\.env.build
set SHIPPING=%ROOT%\shipping
diff --git a/shipping/setup.py b/shipping/setup.py
index 44683dd..7fc847a 100644
--- a/shipping/setup.py
+++ b/shipping/setup.py
@@ -32,5 +32,6 @@
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
],
- python_requires='>=3.6'
+ python_requires='>=3.6',
+ install_requires='sic>=1.0.4'
)
From 8966b8ee8db4b54180f3ce6a260db404fd38bc26 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 21:26:10 -0400
Subject: [PATCH 111/116] building scripts and chmod (linux)
---
scripts/linux/buildtargz.sh | 2 ++
scripts/linux/buildwheel.sh | 0
2 files changed, 2 insertions(+)
mode change 100644 => 100755 scripts/linux/buildtargz.sh
mode change 100644 => 100755 scripts/linux/buildwheel.sh
diff --git a/scripts/linux/buildtargz.sh b/scripts/linux/buildtargz.sh
old mode 100644
new mode 100755
index 1d5e4d7..e1f9847
--- a/scripts/linux/buildtargz.sh
+++ b/scripts/linux/buildtargz.sh
@@ -29,6 +29,8 @@ virtualenv -p $1 ${ENV}
${ENV}/bin/python3 ${SHIPPING}/make_setup.py sdist
${ENV}/bin/python3 ${ROOT}/setup.py sdist
+rm -r ${ENV}
+rm -r ${ROOT}/pilsner.egg-info
rm ${ROOT}/setup.py
cd ${RUNDIR}
diff --git a/scripts/linux/buildwheel.sh b/scripts/linux/buildwheel.sh
old mode 100644
new mode 100755
From 66e6653f98f3cacee84986c2256b9ea8a47883ae Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 22:08:26 -0400
Subject: [PATCH 112/116] removed sandbox
---
test/sandbox.py | 67 -------------------------------------------------
1 file changed, 67 deletions(-)
delete mode 100644 test/sandbox.py
diff --git a/test/sandbox.py b/test/sandbox.py
deleted file mode 100644
index 4b70774..0000000
--- a/test/sandbox.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import sys; sys.path.insert(0, '')
-import pilsner # pylint: disable=E0611,F0401
-
-_messages = []
-_status = []
-
-def callback_update_mesage(message):
- _messages.append(message)
-
-def callback_update_status(status):
- _status.append(status)
-
-def save_it():
- m = pilsner.Model()
- m.add_normalizer('tokenizer1', 'test/assets/tokenizer1.xml')
- m.add_normalizer('tokenizer2', 'test/assets/tokenizer2.xml')
- m.normalizer_map = {
- 'tokenizer1': 'tokenizer1',
- 'tokenizer2': 'tokenizer2'
- }
- r = pilsner.Utility(callback_status=callback_update_status, callback_progress=callback_update_mesage)
- specs = {'DType': (0, None, True, False), 'MSID': (1, None, False, False), 'value': (2, None, False, True)}
- fields = [
- {'name': 'DType', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': True, 'value_flag': False},
- {'name': 'MSID', 'include': True, 'delimiter': None, 'id_flag': True, 'normalizer_flag': False, 'value_flag': False},
- {'name': 'Value', 'include': True, 'delimiter': None, 'id_flag': False, 'normalizer_flag': False, 'value_flag': True},
- {'name': 'smth', 'include': True, 'delimiter': ',', 'id_flag': False, 'normalizer_flag': False, 'value_flag': False}
- ]
- specs = r.compile_dict_specs(fields)
- _messages.clear()
- r.compile_model(m, 'test/assets/sample_dictionary.txt', fields, ' ', '\t', '\n', item_limit=3, include_keywords=True)
- print(m['~keywords'])
- #s = 'this is awwsome white refrigerator o refrigerator, is it tors not conflicting refrigerator hey'
- s = 'this is awwsome white refrigerator , and it is awesome white refrigerator'
- _messages.clear()
- #q = r.parse(m, s)
- #print(q)
- m.save('.test_model')
-
-def load_it():
- rrr = pilsner.Utility(callback_status=callback_update_status, callback_progress=callback_update_mesage)
- m = pilsner.Model('.test_model')
- s = 'this is awesome white refrigerators o refrigerator, is it not'
- s *= 10
- _messages.clear()
- q = rrr.parse(m, s, attrs_where={'+': {'smth': {'D', 'A'}}}, attrs_out=['MSID', 'smth'])
- #print(q)
-
-save_it()
-load_it()
-
-#segments = [tuple([1, 2]), tuple([3, 8]), tuple([1, 6]), tuple([2, 3])]
-#r = Utility()
-#red = r.reduce(segments)
-#print(red)
-
-print(_messages)
-print(_status)
-
-#layers = [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([0], {0: {'MSID': ['entity2'], 'smth': ['C', 'D', 'E']}}, 'acinic cell carcino mas', 8, 31)]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([2], {2: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'acinic carcinomas', 8, 25), ([5], {5: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'it', 45, 46), ([6], {6: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'o', 26, 27)])]
-#layers = [([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 30, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([0], {0: {'MSID': ['entity2'], 'smth': ['C', 'D', 'E']}}, 'acinic cell carcino mas', 8, 31)]), ([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 42, 43, 44, 45, 46, 47, 48, 49, 50], [([2], {2: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'acinic carcinomas', 8, 26), ([5], {5: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'it', 45, 46), ([6], {6: {'MSID': ['entity1'], 'smth': ['A', 'B', 'C']}}, 'o', 26, 27)])]
-
-#rrr = pilsner.Utility(callback_status=callback_update_status, callback_progress=callback_update_mesage)
-#x = rrr.flatten(layers)
-#print(x)
-
-
From cab1171d0b4d1e79da5e58aea22218576e183bb9 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 22:14:03 -0400
Subject: [PATCH 113/116] updated gitignore
---
.gitignore | 1 -
1 file changed, 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index a814b03..fa6fa9d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,6 @@
**/__pycache__/*
build/*
cythonized/*
-dist/*
bin/*
*.spec
pilsner.egg-info/*
From 6db9f1b52d45dcf9deeff78d438765fe0e5a7630 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 22:14:13 -0400
Subject: [PATCH 114/116] updated version
---
shipping/setup.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/shipping/setup.py b/shipping/setup.py
index 7fc847a..17cca19 100644
--- a/shipping/setup.py
+++ b/shipping/setup.py
@@ -9,7 +9,7 @@
setup(
name='pilsner',
- version='X.X.X',
+ version='0.0.1',
description='Utility for dictionary-based named entity recognition',
long_description=long_description,
long_description_content_type='text/markdown',
From 578ac2ce7af182ab7f969e10484a3454007e7367 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 22:21:16 -0400
Subject: [PATCH 115/116] changelog version 0.0.1
---
CHANGELOG.md | 12 ++++++++++++
1 file changed, 12 insertions(+)
create mode 100644 CHANGELOG.md
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..99c6c67
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,12 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.0.1] - 2020-10-08
+
+### Added
+
+- Alpha version of the utility
From 83326c998095de304bf4d8ee170b640fb9e9fc75 Mon Sep 17 00:00:00 2001
From: pgolo
Date: Thu, 8 Oct 2020 22:48:32 -0400
Subject: [PATCH 116/116] version 0.0.1
---
dist/pilsner-0.0.1-cp36-cp36m-win_amd64.whl | Bin 0 -> 499423 bytes
dist/pilsner-0.0.1-cp37-cp37m-win_amd64.whl | Bin 0 -> 499455 bytes
dist/pilsner-0.0.1-cp38-cp38-win_amd64.whl | Bin 0 -> 506415 bytes
dist/pilsner-0.0.1.tar.gz | Bin 0 -> 20672 bytes
4 files changed, 0 insertions(+), 0 deletions(-)
create mode 100644 dist/pilsner-0.0.1-cp36-cp36m-win_amd64.whl
create mode 100644 dist/pilsner-0.0.1-cp37-cp37m-win_amd64.whl
create mode 100644 dist/pilsner-0.0.1-cp38-cp38-win_amd64.whl
create mode 100644 dist/pilsner-0.0.1.tar.gz
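For reference, the dist/ artifacts listed above are standard Python packaging outputs: three platform-specific binary wheels (the cp36/cp37/cp38 win_amd64 tags indicate compiled extension modules for CPython 3.6-3.8 on 64-bit Windows) and one source distribution. A minimal sketch of how they could be consumed, assuming a local checkout (none of these commands appear in the patch series itself):

    # install a prebuilt wheel matching the local interpreter (here CPython 3.8, 64-bit Windows)
    pip install dist/pilsner-0.0.1-cp38-cp38-win_amd64.whl
    # or build from the source distribution (needs a compiler toolchain for the extensions)
    pip install dist/pilsner-0.0.1.tar.gz
    # or, if the release were published to PyPI, pin the exact version
    pip install pilsner==0.0.1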
diff --git a/dist/pilsner-0.0.1-cp36-cp36m-win_amd64.whl b/dist/pilsner-0.0.1-cp36-cp36m-win_amd64.whl
new file mode 100644
index 0000000000000000000000000000000000000000..c89f2566a8c290b30590a72e92d8f7d689254398
GIT binary patch
literal 499423
[499423 bytes of base85-encoded binary wheel data omitted]