Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 2.29.0 #759

Merged
merged 18 commits into from
Dec 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,18 @@ Versioned according to [Semantic Versioning](http://semver.org/).

## Unreleased

Changed:

* `ocrd_utils.make_file_id`: combine with output fileGrp if input has pageId, but don't extract numbers, #744
* `OcrdMets.add_file`: `mets:fileGrp/@USE` must be valid `xs:ID`, #746

Added:

* `ocrd ocrd-tool`: wrap `list-resources` and `show-resource` from `Processor`
* bashlib `ocrd__parse_argv`: add `--list-resources` and `--show-resource`, #751
* `ocrd bashlib`: wrap `input-files` from `Processor` and `make_file_id`
* bashlib `ocrd__wrap`: offer `ocrd__files` and `ocrd__input_file`, #571

## [2.28.0] - 2021-11-30

Added:
Expand Down
13 changes: 13 additions & 0 deletions ocrd/bashlib/src/dumpjson.bash
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,16 @@ ocrd__dumpjson () {
ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump
}

##
## Output the content of the tool's file resource named by the first
## argument, by delegating to `ocrd ocrd-tool ... show-resource` for the
## tool identified by `$OCRD_TOOL_JSON` and `$OCRD_TOOL_NAME`.
##
ocrd__show_resource () {
ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1"
}

##
## Output the names of the tool's file resources, by delegating to
## `ocrd ocrd-tool ... list-resources` for the tool identified by
## `$OCRD_TOOL_JSON` and `$OCRD_TOOL_NAME`.
##
ocrd__list_resources () {
ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" list-resources
}
2 changes: 2 additions & 0 deletions ocrd/bashlib/src/parse_argv.bash
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ ocrd__parse_argv () {
-l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
-h|--help|--usage) ocrd__usage; exit ;;
-J|--dump-json) ocrd__dumpjson; exit ;;
-C|--show-resource) ocrd__show_resource "$2"; exit ;;
-L|--list-resources) ocrd__list_resources; exit ;;
-p|--parameter) __parameters+=(-p "$2") ; shift ;;
-P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;;
-g|--page-id) ocrd__argv[page_id]=$2 ; shift ;;
Expand Down
16 changes: 16 additions & 0 deletions ocrd/bashlib/src/wrap.bash
Original file line number Diff line number Diff line change
Expand Up @@ -27,4 +27,20 @@ ocrd__wrap () {

ocrd__parse_argv "$@"

i=0
declare -ag ocrd__files
while read line; do
eval declare -Ag "ocrd__file$i=( $line )"
eval "ocrd__files[$i]=ocrd__file$i"
let ++i
done < <(ocrd bashlib input-files \
-m "${ocrd__argv[mets_file]}" \
-I "${ocrd__argv[input_file_grp]}" \
-O "${ocrd__argv[output_file_grp]}" \
${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-})
}

# usage: pageId=$(ocrd__input_file 3 pageId)
#
# Print one field of one input file.
#   $1 - index into the global array `ocrd__files`
#   $2 - key within the associative array named by that entry
ocrd__input_file() {
    # ocrd__files[$1] holds the *name* of an associative array. Build
    # "name[key]" and dereference it via indirect expansion: unlike
    # `eval echo ...`, this never re-parses the stored value as shell code
    # and (being quoted) does not word-split or glob-expand it.
    local ocrd__field_ref="${ocrd__files[$1]}[$2]"
    # printf instead of echo, so values like "-n" or "-e" are printed verbatim
    printf '%s\n' "${!ocrd__field_ref}"
}
55 changes: 55 additions & 0 deletions ocrd/ocrd/cli/bashlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,27 @@
"""
from __future__ import print_function
import sys
from os.path import isfile
import click

from ocrd.constants import BASHLIB_FILENAME
import ocrd.constants
import ocrd_utils.constants
import ocrd_models.constants
import ocrd_validators.constants
from ocrd.decorators import (
parameter_option,
parameter_override_option,
ocrd_loglevel
)
from ocrd_utils import (
is_local_filename,
get_local_filename,
initLogging,
make_file_id
)
from ocrd.resolver import Resolver
from ocrd.processor import Processor

# ----------------------------------------------------------------------
# ocrd bashlib
Expand Down Expand Up @@ -61,3 +75,44 @@ def bashlib_constants(name):
print("[%s]=%s" % (key, val[key]), end=' ')
else:
print(val)

@bashlib_cli.command('input-files')
@click.option('-m', '--mets', help="METS to process", default="mets.xml")
@click.option('-w', '--working-dir', help="Working Directory")
@click.option('-I', '--input-file-grp', help='File group(s) used as input.', default='INPUT')
@click.option('-O', '--output-file-grp', help='File group(s) used as output.', default='OUTPUT')
@click.option('-g', '--page-id', help="ID(s) of the pages to process")
# repeat some other processor options for convenience (will be ignored here)
@click.option('--overwrite', is_flag=True, default=False, help="Remove output pages/images if they already exist")
@parameter_option
@parameter_override_option
@ocrd_loglevel
def bashlib_input_files(**kwargs):
    """
    List input files for processing

    Instantiate a processor and workspace from the given processing options.
    Then loop through the input files of the input fileGrp, and for each one,
    print its `url`, `ID`, `mimetype` and `pageId`, as well as its recommended
    `outputFileId` (from ``make_file_id``).

    (The printing format is one associative array initializer per line.
    Every value is shell-quoted, so that the line can safely be evaluated
    by bash even if a value contains quotes or other shell metacharacters.)
    """
    # function-scope import: only this command needs it
    from shlex import quote

    initLogging()
    mets = kwargs.pop('mets')
    working_dir = kwargs.pop('working_dir')
    if is_local_filename(mets) and not isfile(get_local_filename(mets)):
        msg = "File does not exist: %s" % mets
        raise Exception(msg)
    resolver = Resolver()
    workspace = resolver.workspace_from_url(mets, working_dir)
    output_file_grp = kwargs['output_file_grp']
    processor = Processor(workspace,
                          ocrd_tool=None,
                          page_id=kwargs['page_id'],
                          input_file_grp=kwargs['input_file_grp'],
                          output_file_grp=output_file_grp)
    for input_file in processor.input_files:
        fields = [(field, getattr(input_file, field))
                  for field in ['url', 'ID', 'mimetype', 'pageId']]
        fields.append(('outputFileId', make_file_id(input_file, output_file_grp)))
        # make this bash-friendly (show initialization for associative array);
        # the consumer evaluates this line, so values must be escaped, not
        # merely wrapped in single quotes
        print(' '.join("[%s]=%s" % (key, quote(str(val))) for key, val in fields))
26 changes: 20 additions & 6 deletions ocrd/ocrd/cli/ocrd_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
import click

from ocrd.decorators import parameter_option, parameter_override_option
from ocrd.processor import generate_processor_help
from ocrd.processor import Processor
from ocrd_utils import (
set_json_key_value_overrides,
VERSION as OCRD_VERSION,
parse_json_string_with_comments as loads
)
set_json_key_value_overrides,
VERSION as OCRD_VERSION,
parse_json_string_with_comments as loads
)
from ocrd_validators import ParameterValidator, OcrdToolValidator

class OcrdToolCtx():
Expand Down Expand Up @@ -93,10 +93,24 @@ def ocrd_tool_tool(ctx, tool_name):
def ocrd_tool_tool_description(ctx):
print(ctx.json['tools'][ctx.tool_name]['description'])

@ocrd_tool_tool.command('list-resources', help="List tool's file resources")
@pass_ocrd_tool
def ocrd_tool_tool_list_resources(ctx):
    """Instantiate a workspace-less ``Processor`` for the selected tool with ``list_resources=True``."""
    # NOTE(review): presumably Processor performs the listing itself when
    # constructed with this flag; verify against ocrd.processor
    tool_json = ctx.json['tools'][ctx.tool_name]
    Processor(None, ocrd_tool=tool_json, list_resources=True)

@ocrd_tool_tool.command('show-resource', help="Dump a tool's file resource")
@click.argument('res_name')
@pass_ocrd_tool
def ocrd_tool_tool_show_resource(ctx, res_name):
    """Instantiate a workspace-less ``Processor`` for the selected tool with ``show_resource=res_name``."""
    # NOTE(review): presumably Processor dumps the resource itself when
    # constructed with this argument; verify against ocrd.processor
    tool_json = ctx.json['tools'][ctx.tool_name]
    Processor(None, ocrd_tool=tool_json, show_resource=res_name)

# Click sub-command `help` of the per-tool command group: emits the help
# text for the processor described by ctx.json['tools'][ctx.tool_name].
# NOTE(review): this span comes from a rendered diff without +/- markers;
# the print(generate_processor_help(...)) line and the Processor(...,
# show_help=True) call look like the old and the new implementation
# respectively. Confirm which one belongs in the final file.
@ocrd_tool_tool.command('help', help="Generate help for processors")
@pass_ocrd_tool
def ocrd_tool_tool_params_help(ctx):
print(generate_processor_help(ctx.json['tools'][ctx.tool_name]))
Processor(None, ocrd_tool=ctx.json['tools'][ctx.tool_name],
show_help=True)

# ----------------------------------------------------------------------
# ocrd ocrd-tool tool categories
Expand Down
32 changes: 32 additions & 0 deletions ocrd/ocrd/lib.bash
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,20 @@ ocrd__dumpjson () {
ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" dump
}

##
## Output the content of the tool's file resource named by the first
## argument, by delegating to `ocrd ocrd-tool ... show-resource` for the
## tool identified by `$OCRD_TOOL_JSON` and `$OCRD_TOOL_NAME`.
##
ocrd__show_resource () {
ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" show-resource "$1"
}

##
## Output the names of the tool's file resources, by delegating to
## `ocrd ocrd-tool ... list-resources` for the tool identified by
## `$OCRD_TOOL_JSON` and `$OCRD_TOOL_NAME`.
##
ocrd__list_resources () {
ocrd ocrd-tool "$OCRD_TOOL_JSON" tool "$OCRD_TOOL_NAME" list-resources
}

# END-INCLUDE
# BEGIN-INCLUDE ./src/usage.bash
## ### `ocrd__usage`
Expand Down Expand Up @@ -122,6 +136,8 @@ ocrd__parse_argv () {
-l|--log-level) ocrd__argv[log_level]=$2 ; shift ;;
-h|--help|--usage) ocrd__usage; exit ;;
-J|--dump-json) ocrd__dumpjson; exit ;;
-C|--show-resource) ocrd__show_resource "$2"; exit ;;
-L|--list-resources) ocrd__list_resources; exit ;;
-p|--parameter) __parameters+=(-p "$2") ; shift ;;
-P|--parameter-override) __parameter_overrides+=(-P "$2" "$3") ; shift ; shift ;;
-g|--page-id) ocrd__argv[page_id]=$2 ; shift ;;
Expand Down Expand Up @@ -209,6 +225,22 @@ ocrd__wrap () {

ocrd__parse_argv "$@"

i=0
declare -ag ocrd__files
while read line; do
eval declare -Ag "ocrd__file$i=( $line )"
eval "ocrd__files[$i]=ocrd__file$i"
let ++i
done < <(ocrd bashlib input-files \
-m "${ocrd__argv[mets_file]}" \
-I "${ocrd__argv[input_file_grp]}" \
-O "${ocrd__argv[output_file_grp]}" \
${ocrd__argv[page_id]:+-g} ${ocrd__argv[page_id]:-})
}

# usage: pageId=$(ocrd__input_file 3 pageId)
#
# Print one field of one input file.
#   $1 - index into the global array `ocrd__files`
#   $2 - key within the associative array named by that entry
ocrd__input_file() {
    # ocrd__files[$1] holds the *name* of an associative array. Build
    # "name[key]" and dereference it via indirect expansion: unlike
    # `eval echo ...`, this never re-parses the stored value as shell code
    # and (being quoted) does not word-split or glob-expand it.
    local ocrd__field_ref="${ocrd__files[$1]}[$2]"
    # printf instead of echo, so values like "-n" or "-e" are printed verbatim
    printf '%s\n' "${!ocrd__field_ref}"
}

# END-INCLUDE
4 changes: 3 additions & 1 deletion ocrd_models/ocrd_models/ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,9 @@ def add_file(self, fileGrp, mimetype=None, url=None, ID=None, pageId=None, force
if not fileGrp:
raise ValueError("Must set fileGrp of the mets:file")
if not REGEX_FILE_ID.fullmatch(ID):
raise ValueError("Invalid syntax for mets:file/@ID %s" % ID)
raise ValueError("Invalid syntax for mets:file/@ID %s (not an xs:ID)" % ID)
if not REGEX_FILE_ID.fullmatch(fileGrp):
    # report the offending fileGrp name; the file ID was already validated above
    raise ValueError("Invalid syntax for mets:fileGrp/@USE %s (not an xs:ID)" % fileGrp)
el_fileGrp = self._tree.getroot().find(".//mets:fileGrp[@USE='%s']" % (fileGrp), NS)
if el_fileGrp is None:
el_fileGrp = self.add_file_group(fileGrp)
Expand Down
15 changes: 8 additions & 7 deletions ocrd_utils/ocrd_utils/str.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,24 +62,25 @@ def make_file_id(ocrd_file, output_file_grp):
Derive a new file ID for an output file from an existing input file ``ocrd_file``
and the name of the output file's ``fileGrp/@USE``, ``output_file_grp``.
If ``ocrd_file``'s ID contains the input file's fileGrp name, then replace it by ``output_file_grp``.
Else if ``ocrd_file``'s ID contains the input file's pageId, then merely append ``output_file_grp``.
Otherwise use ``output_file_grp`` together with the position of ``ocrd_file`` within the input fileGrp
(as a fallback counter). Increment counter until there is no more ID conflict.
(as a fallback counter), and increment counter until there is no more ID conflict.
"""
ret = ocrd_file.ID.replace(ocrd_file.fileGrp, output_file_grp)
if ret == ocrd_file.ID:
m = re.match(r'.*?(\d{3,}).*', ocrd_file.pageId or '')
if m:
n = int(m.group(1))
if ocrd_file.pageId and ocrd_file.pageId in ocrd_file.ID:
# still sufficiently unique
ret = output_file_grp + '_' + ocrd_file.ID
else:
ids = [f.ID for f in ocrd_file.mets.find_files(fileGrp=ocrd_file.fileGrp, mimetype=ocrd_file.mimetype)]
try:
n = ids.index(ocrd_file.ID) + 1
except ValueError:
n = len(ids)
ret = concat_padded(output_file_grp, n)
while next(ocrd_file.mets.find_files(ID=ret), None):
n += 1
ret = concat_padded(output_file_grp, n)
while next(ocrd_file.mets.find_files(ID=ret), None):
n += 1
ret = concat_padded(output_file_grp, n)
if not REGEX_FILE_ID.fullmatch(ret):
ret = ret.replace(':', '_')
ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret)
Expand Down
2 changes: 1 addition & 1 deletion ocrd_utils/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name='ocrd_utils',
version='2.28.0',
version='2.29.0',
description='OCR-D framework - shared code, helpers, constants',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
Expand Down
2 changes: 1 addition & 1 deletion repo/spec
Submodule spec updated 4 files
+9 −3 CHANGELOG.md
+4 −0 README.md
+0 −364 glossary.de.md
+1 −1 mets.md
6 changes: 6 additions & 0 deletions tests/model/test_ocrd_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,5 +255,11 @@ def test_merge(self):
self.mets.merge(other_mets, fileGrp_mapping={'OCR-D-IMG': 'FOO'})
assert len(self.mets.file_groups) == 18

def test_invalid_filegrp(self):
    """
    Adding a file to a fileGrp whose name is not a valid xs:ID must raise ValueError.

    https://github.com/OCR-D/core/issues/746
    """
    empty_mets = OcrdMets(content="<mets></mets>")
    expected_message = "Invalid syntax for mets:fileGrp/@USE"
    with self.assertRaisesRegex(ValueError, expected_message):
        empty_mets.add_file('1:! bad filegrp', ID="foo123", pageId="foobar")

if __name__ == '__main__':
main(__file__)
16 changes: 13 additions & 3 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,9 +289,19 @@ def test_make_file_id_570(self):
# NOTE(review): this span comes from a rendered diff without +/- markers.
# The first add_file/add_file/assertEqual triple (fileGrps '1:!GRP' and
# '2:!GRP') looks like the pre-change version of the test, the second
# triple ('GRP1'/'GRP2') like its replacement. Confirm before running.
def test_make_file_id_605(self):
"""https://github.com/OCR-D/core/pull/605"""
mets = OcrdMets.empty_mets()
f = mets.add_file('1:!GRP', ID='FOO_0001', pageId='phys0001')
f = mets.add_file('2:!GRP', ID='FOO_0002', pageId='phys0002')
self.assertEqual(make_file_id(f, '2:!GRP'), 'id_2_GRP_0002')
f = mets.add_file('GRP1', ID='FOO_0001', pageId='phys0001')
f = mets.add_file('GRP2', ID='FOO_0002', pageId='phys0002')
self.assertEqual(make_file_id(f, 'GRP2'), 'GRP2_0001')

def test_make_file_id_744(self):
    """
    The ID derived for the second of two files in the same fileGrp is ``GRP2_0002``.

    https://github.com/OCR-D/core/pull/744
    > Often file IDs have two numbers, one of which will clash. In that case only the numerical fallback works.
    """
    mets = OcrdMets.empty_mets()
    # the first file only needs to exist in the fileGrp; its handle is not used
    mets.add_file('GRP2', ID='img1796-97_00000024_img', pageId='phys0024')
    second_file = mets.add_file('GRP2', ID='img1796-97_00000025_img', pageId='phys0025')
    derived_id = make_file_id(second_file, 'GRP2')
    self.assertEqual(derived_id, 'GRP2_0002')

def test_generate_range(self):
assert generate_range('PHYS_0001', 'PHYS_0005') == ['PHYS_0001', 'PHYS_0002', 'PHYS_0003', 'PHYS_0004', 'PHYS_0005']
Expand Down