Skip to content

Commit

Permalink
Improve DFXP parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
mantas-done committed Jun 15, 2023
1 parent c9a5258 commit 6bbdbaa
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 76 deletions.
32 changes: 1 addition & 31 deletions src/Code/Converters/DfxpConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,7 @@ public function canParseFileContent($file_content)

/**
 * Converts raw DFXP markup into the internal subtitle format.
 *
 * NOTE(review): this listing looks like a rendered diff in which the removed
 * regex-based body and the added delegation to TtmlConverter are shown together.
 * Read literally, the first `return` wins and the TtmlConverter call below it is
 * never reached -- confirm which variant is the intended one.
 *
 * @param string $file_content DFXP document text (used as the regex subject)
 * @return array list of blocks, each with 'start', 'end' and 'lines' keys
 */
public function fileContentToInternalFormat($file_content)
{
// Capture begin/end attributes and the inner text of every <p ...>...</p>.
// The pattern has no "s" modifier, so "." does not cross line breaks and a
// paragraph is only matched when it sits on a single line.
preg_match_all('/<p.+begin="(?<start>[^"]+).*end="(?<end>[^"]+)[^>]*>(?<text>(?!<\/p>).+)<\/p>/', $file_content, $matches, PREG_SET_ORDER);

$internal_format = [];
foreach ($matches as $block) {
$block['text'] = preg_replace('/<br\s*\/?>/', '<br>', $block['text']); // normalize <br>
$internal_format[] = [
'start' => static::dfxpTimeToInternal($block['start']),
'end' => static::dfxpTimeToInternal($block['end']),
// one subtitle line per <br>-separated segment
'lines' => explode('<br>', $block['text']),
];
}

return $internal_format;
// Delegates parsing to the TTML converter (not reached as listed, see note above).
return (new TtmlConverter())->fileContentToInternalFormat($file_content);
}

public function internalFormatToFileContent(array $internal_format)
Expand Down Expand Up @@ -74,22 +62,4 @@ protected static function internalTimeToDfxp($internal_time)
{
return ($internal_time * 10000000) . 't';
}

/**
 * Converts a DFXP time expression into seconds.
 *
 * Two spellings are understood:
 *  - tick counts such as "340400000t", divided by 10000000 to get seconds;
 *  - clock times such as "00:00:34,040" or "00:00:34.040", where the part
 *    after the first comma or dot is the fraction of a second. The fraction
 *    is optional ("00:00:34" yields 34.0).
 *
 * @param string $dfxp_time time expression taken from a begin/end attribute
 * @return float|int seconds
 */
protected static function dfxpTimeToInternal($dfxp_time)
{
    if (substr($dfxp_time, -1) === 't') { // if last symbol is "t"
        // parses 340400000t
        return substr($dfxp_time, 0, -1) / 10000000;
    }

    // parses 00:00:34,040 and 00:00:34.040
    // Split on the first comma OR dot: splitting on the comma alone left
    // $parts[1] undefined for dot-separated or fraction-less values, which
    // raised a warning and silently dropped the milliseconds.
    $parts = preg_split('/[,.]/', $dfxp_time, 2);

    // Seconds since midnight, obtained by anchoring the clock time to the epoch day.
    // NOTE(review): strtotime only accepts a valid time of day; confirm that
    // values of 24 hours or more are not expected here.
    $only_seconds = strtotime("1970-01-01 {$parts[0]} UTC");
    $milliseconds = isset($parts[1]) ? (float)('0.' . $parts[1]) : 0.0;

    return $only_seconds + $milliseconds;
}
}
63 changes: 34 additions & 29 deletions src/Code/Converters/TtmlConverter.php
Original file line number Diff line number Diff line change
Expand Up @@ -17,36 +17,38 @@ public function fileContentToInternalFormat($file_content)
$array = array();

$body = $dom->getElementsByTagName('body')->item(0);
if ($body) {
$div = $body->getElementsByTagName('div')->item(0);
if ($div) {
$pElements = $div->getElementsByTagName('p');
foreach ($pElements as $p) {
$begin = $p->getAttribute('begin');
$end = $p->getAttribute('end');
$lines = '';

$textNodes = $p->childNodes;
foreach ($textNodes as $node) {
if ($node->nodeType === XML_TEXT_NODE) {
$lines .= $node->nodeValue;
} else {
$lines .= $dom->saveXML($node); // Preserve HTML tags
}
}

$lines = preg_replace('/<br\s*\/?>/', '<br>', $lines); // normalize <br>*/
$lines = explode('<br>', $lines);
$lines = array_map('strip_tags', $lines);
$lines = array_map('trim', $lines);

$array[] = array(
'start' => static::ttmlTimeToInternal($begin),
'end' => static::ttmlTimeToInternal($end),
'lines' => $lines,
);
if (!$body) {
throw new \Exception('no body');
}
$div = $body->getElementsByTagName('div')->item(0);
if (!$div) {
throw new \Exception('no div');
}
$pElements = $div->getElementsByTagName('p');
foreach ($pElements as $p) {
$begin = $p->getAttribute('begin');
$end = $p->getAttribute('end');
$lines = '';

$textNodes = $p->childNodes;
foreach ($textNodes as $node) {
if ($node->nodeType === XML_TEXT_NODE) {
$lines .= $node->nodeValue;
} else {
$lines .= $dom->saveXML($node); // Preserve HTML tags
}
}

$lines = preg_replace('/<br\s*\/?>/', '<br>', $lines); // normalize <br>*/
$lines = explode('<br>', $lines);
$lines = array_map('strip_tags', $lines);
$lines = array_map('trim', $lines);

$array[] = array(
'start' => static::ttmlTimeToInternal($begin),
'end' => static::ttmlTimeToInternal($end),
'lines' => $lines,
);
}

return $array;
Expand Down Expand Up @@ -112,7 +114,10 @@ protected static function internalTimeToTtml($internal_time)

protected static function ttmlTimeToInternal($ttml_time)
{
if (substr($ttml_time, -1) === 's') {
if (substr($ttml_time, -1) === 't') { // if last symbol is "t"
// parses 340400000t
return substr($ttml_time, 0, -1) / 10000000;
} elseif (substr($ttml_time, -1) === 's') {
return rtrim($ttml_time, 's');
} else {
$timeParts = explode(':', $ttml_time);
Expand Down
2 changes: 1 addition & 1 deletion tests/files/dfxp_with_different_br.dfxp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
</head>
<body>
<div style="s1" xml:id="d1">
<p xml:id="p1" begin="00:00:00,000" end="00:00:01,000" region="bottomCenter">one<br/>two<br>three<br />four</p>
<p xml:id="p1" begin="00:00:00,000" end="00:00:01,000" region="bottomCenter">one<br/>two<br />three</p>
</div>
</body>
</tt>
19 changes: 4 additions & 15 deletions tests/formats/DfxpTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,20 @@ public function testConvertFromDfxpToSrt()
$srt_path = './tests/files/srt.srt';
$dfxp_path = './tests/files/dfxp.dfxp';

// stl to srt
$dfxp_object = Subtitles::loadFromFile($dfxp_path);
$stl_internal_format = $dfxp_object->getInternalFormat();
$actual = $dfxp_object->getInternalFormat();

$srt_object = Subtitles::loadFromFile($srt_path);
$srt_internal_format = $srt_object->getInternalFormat();
// compare both internal formats
foreach ($srt_internal_format as $block_key => $srt_block) {
$start_time_diff = abs($srt_block['start'] - $stl_internal_format[$block_key]['start']);
$this->assertLessThan(0.1, $start_time_diff);

$end_time_diff = abs($srt_block['end'] - $stl_internal_format[$block_key]['end']);
$this->assertLessThan(0.1, $end_time_diff);
$expected = $srt_object->getInternalFormat();

foreach ($srt_block['lines'] as $line_key => $srt_line) {
$this->assertEquals($srt_line, $stl_internal_format[$block_key]['lines'][$line_key]);
}
}
$this->assertInternalFormatsEqual($expected, $actual);
}

/**
 * Loads the DFXP fixture that mixes several <br> spellings and checks that the
 * parsed internal format equals a hand-built subtitle running from 0 to 1.
 */
public function testParsesDifferentBr()
{
$dfxp_object = Subtitles::loadFromFile('./tests/files/dfxp_with_different_br.dfxp');
$actual = $dfxp_object->getInternalFormat();
// NOTE(review): $expected is assigned twice, so only the second (three-line)
// value is compared. This looks like the removed and added lines of a diff
// shown together -- confirm which expectation matches the fixture.
$expected = (new Subtitles())->add(0, 1, ['one', 'two', 'three', 'four'])->getInternalFormat();
$expected = (new Subtitles())->add(0, 1, ['one', 'two', 'three'])->getInternalFormat();
$this->assertInternalFormatsEqual($expected, $actual);
}

Expand Down

0 comments on commit 6bbdbaa

Please sign in to comment.