Added support of BSON files to analyze command
ivbeg committed Jan 31, 2022
1 parent e91821c commit 50c9138
Showing 4 changed files with 68 additions and 39 deletions.
HISTORY.rst: 4 changes (2 additions, 2 deletions)
@@ -4,8 +4,8 @@ History
=======

1.0.12 (2022-01-30)
-------------------
* Added command "analyze"; it provides human-readable information about data files: CSV, JSON lines, JSON, XML. Detects encoding, delimiters, file type, and fields with objects for JSON and XML files. Doesn't support gzipped, ZIPped, or other compressed files yet.
-------------------
* Added command "analyze"; it provides human-readable information about data files: CSV, BSON, JSON lines, JSON, XML. Detects encoding, delimiters, file type, and fields with objects for JSON and XML files. Doesn't support gzipped, ZIPped, or other compressed files yet.

1.0.11 (2022-01-30)
-------------------
README.rst: 2 changes (1 addition, 1 deletion)
@@ -328,7 +328,7 @@ Analyzes data format and provides human-readable information.
Returned values will include:

* Filename - name of the file
* File type - type of the file, could be: jsonl, xml, csv, json
* File type - type of the file, could be: jsonl, xml, csv, json, bson
* Encoding - file encoding
* Delimiter - file delimiter if CSV file
* File size - size of the file, bytes
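As an illustration (a sketch, not part of the diff), the analyzer added in this commit can also be called directly from Python; the path data.bson is a placeholder, and the returned report is the list of [name, value] pairs summarized above:

# Hypothetical usage sketch: call the BSON analyzer added by this commit
# directly, bypassing the command line. "data.bson" is a placeholder path.
from undatum.cmds.analyzer import analyze_bson

report = analyze_bson("data.bson")   # list of [name, value] pairs
for name, value in report:
    print(f"{name}: {value}")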
undatum/cmds/analyzer.py: 65 changes (29 additions, 36 deletions)
@@ -1,5 +1,7 @@
# -*- coding: utf8 -*-
# FIXME: A lot of unoptimized code here, it could be better, shorter and some functions could be improved
import os
from ..utils import get_file_type, get_option, dict_generator, guess_int_size, guess_datatype, detect_delimiter, detect_encoding, get_dict_value
from ..utils import get_file_type, get_option, dict_generator, guess_int_size, guess_datatype, detect_delimiter, detect_encoding, get_dict_value, get_dict_keys, _is_flat, buf_count_newlines_gen
from ..constants import SUPPORTED_FILE_TYPES
from collections import OrderedDict
import bson
@@ -11,41 +13,6 @@
import xmltodict
OBJECTS_ANALYZE_LIMIT = 100

def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count


def get_dict_keys(iterable, limit=1000):
    n = 0
    keys = []
    for item in iterable:
        if limit and n > limit:
            break
        n += 1
        dk = dict_generator(item)
        for i in dk:
            k = ".".join(i[:-1])
            if k not in keys:
                keys.append(k)
    return keys


def _is_flat(item):
    """Measures if object is flat"""
    for k, v in item.items():
        if isinstance(v, tuple) or isinstance(v, list):
            return False
        elif isinstance(v, dict):
            if not _is_flat(v): return False
    return True


def analyze_csv(filename, objects_limit=OBJECTS_ANALYZE_LIMIT):
@@ -98,6 +65,30 @@ def analyze_jsonl(filename, objects_limit=OBJECTS_ANALYZE_LIMIT):
    return report


def analyze_bson(filename, objects_limit=OBJECTS_ANALYZE_LIMIT):
    """Analyzes BSON file"""
    report = []
    report.append(['Filename', filename])
    report.append(['File type', 'bson'])
    report.append(['Filesize', str(os.path.getsize(filename))])
    f = open(filename, 'rb')
    flat = True
    objects = []
    n = 0
    for o in bson.decode_file_iter(f):
        n += 1
        objects.append(o)
        if n > objects_limit:
            break
    f.close()
    for o in objects[:objects_limit]:
        if not _is_flat(o):
            flat = False
    report.append(['Is flat table?', str(flat)])
    report.append(['Fields', str('\n'.join(get_dict_keys(objects)))])
    return report


def analyze_json(filename, objects_limit=OBJECTS_ANALYZE_LIMIT, filesize_limit=500000000):
    """Analyzes JSON file"""
    report = []
@@ -275,6 +266,8 @@ def analyze(self, filename, options):
            table = analyze_csv(filename)
        elif filetype == 'jsonl':
            table = analyze_jsonl(filename)
        elif filetype == 'bson':
            table = analyze_bson(filename)
        elif filetype == 'json':
            table = analyze_json(filename)
        elif filetype == 'xml':
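For context on the new function, here is a minimal, self-contained sketch of the same sampling pattern using the bson module (assumed here to be the one shipped with the pymongo package); the file name and limit are illustrative:

# Illustrative sketch of analyze_bson's sampling approach: stream documents
# from a BSON dump and stop once a fixed sample size has been collected.
import bson  # assumed to be the bson module bundled with pymongo

SAMPLE_LIMIT = 100  # mirrors OBJECTS_ANALYZE_LIMIT above

def sample_bson(filename, limit=SAMPLE_LIMIT):
    """Return up to `limit` decoded documents from a BSON file."""
    docs = []
    with open(filename, "rb") as f:
        for doc in bson.decode_file_iter(f):  # yields one dict per BSON document
            docs.append(doc)
            if len(docs) >= limit:
                break
    return docs

# docs = sample_bson("dump.bson")  # "dump.bson" is a placeholder path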
undatum/utils.py: 36 changes (36 additions, 0 deletions)
@@ -182,3 +182,39 @@ def guess_datatype(s, qd):
    attrs = {'base' : 'empty'}
    return attrs


def buf_count_newlines_gen(fname):
    def _make_gen(reader):
        while True:
            b = reader(2 ** 16)
            if not b: break
            yield b

    with open(fname, "rb") as f:
        count = sum(buf.count(b"\n") for buf in _make_gen(f.raw.read))
    return count


def get_dict_keys(iterable, limit=1000):
    n = 0
    keys = []
    for item in iterable:
        if limit and n > limit:
            break
        n += 1
        dk = dict_generator(item)
        for i in dk:
            k = ".".join(i[:-1])
            if k not in keys:
                keys.append(k)
    return keys


def _is_flat(item):
    """Measures if object is flat"""
    for k, v in item.items():
        if isinstance(v, tuple) or isinstance(v, list):
            return False
        elif isinstance(v, dict):
            if not _is_flat(v): return False
    return True
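A quick illustrative sketch (not part of the commit) of how the relocated helpers behave on made-up records; the exact key paths reported depend on dict_generator:

# Toy example for the helpers now living in undatum/utils.py.
from undatum.utils import _is_flat, get_dict_keys

records = [
    {"id": 1, "name": "Alice", "tags": ["a", "b"]},
    {"id": 2, "name": "Bob", "address": {"city": "Prague"}},
]

print(_is_flat(records[0]))    # False: "tags" holds a list
print(_is_flat(records[1]))    # True: nested dicts without lists still count as flat
print(get_dict_keys(records))  # dotted key paths such as "address.city"
# buf_count_newlines_gen("data.jsonl") would count newlines in a file
# ("data.jsonl" is a placeholder path).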
