Commit 4901d1ad authored by Wendell Piez's avatar Wendell Piez

Framing out JATS conversion pipeline

parent 5b82b0b9
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="3.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:xsw="http://coko.foundation/xsweet"
exclude-result-prefixes="#all">
<!-- Serialization of the final pipeline result: indented XML without an XML declaration,
     carrying the DOCTYPE of the JATS Article Authoring DTD (MathML3 flavour, v1.1). -->
<xsl:output
    method="xml"
    omit-xml-declaration="yes"
    indent="yes"
    doctype-public="-//NLM//DTD JATS (Z39.96) Article Authoring DTD with MathML3 v1.1 20151215//EN"
    doctype-system="JATS-articleauthoring1-mathml3.dtd"/>
<!-- Ordered pipeline specification. Each xsw:transform child names one stylesheet (its text
     content) and the XSLT version handed to the processor (its version attribute). The root
     template walks these children in document order, feeding each step the result of the
     previous one. Steps that are commented out here are simply not run. -->
<xsl:variable name="transformation-sequence">
<xsw:transform version="3.0">xhtml-jats.xsl</xsw:transform>
<xsw:transform version="3.0">last-jatsify.xsl</xsw:transform>
<!--<xsw:transform version="2.0">itemize-lists.xsl</xsw:transform>-->
</xsl:variable>
<!-- Placeholder rule naming the XHTML root element. Per the original author's note its only
     purpose is to keep XSLT engines from complaining when HTML is supplied as input; it just
     defers to the next matching rule. -->
<xsl:template xmlns:xhtml="http://www.w3.org/1999/xhtml" match="/xhtml:html">
  <xsl:next-match/>
</xsl:template>
<!-- Entry point. Captures the source document node and threads it through every step listed
     in $transformation-sequence: each step is applied (via its matching template) to the
     document produced by the previous step, and the document left after the last step is
     returned. -->
<xsl:template match="/">
  <xsl:variable name="input" select="."/>
  <xsl:iterate select="$transformation-sequence/*">
    <!-- Document being worked on; starts out as the original source. -->
    <xsl:param name="working-doc" as="document-node()" select="$input"/>
    <xsl:on-completion>
      <xsl:sequence select="$working-doc"/>
    </xsl:on-completion>
    <xsl:next-iteration>
      <!-- The result of this step becomes the input of the next one. -->
      <xsl:with-param name="working-doc">
        <xsl:apply-templates select=".">
          <xsl:with-param name="sourcedoc" select="$working-doc"/>
        </xsl:apply-templates>
      </xsl:with-param>
    </xsl:next-iteration>
  </xsl:iterate>
</xsl:template>
<!-- Executes one pipeline step: runs the stylesheet named by this xsw:transform element
     against the document passed in as $sourcedoc and returns the primary result. -->
<xsl:template match="xsw:transform">
  <xsl:param name="sourcedoc" as="document-node()"/>
  <!-- Option map for fn:transform: XSLT version from @version, stylesheet URI from the
       element's string value, and the document to transform. -->
  <xsl:variable name="options" as="map(*)">
    <xsl:map>
      <xsl:map-entry key="'xslt-version'" select="xs:decimal(@version)"/>
      <xsl:map-entry key="'stylesheet-location'" select="string(.)"/>
      <xsl:map-entry key="'source-node'" select="$sourcedoc"/>
    </xsl:map>
  </xsl:variable>
  <!-- fn:transform returns a map; the primary result sits under 'output'
       unless a base output URI is given
       https://www.w3.org/TR/xpath-functions-31/#func-transform -->
  <xsl:sequence select="transform($options)?output"/>
</xsl:template>
<!-- Fallback for any element not handled by a more specific rule (in practice, an entry in
     the transformation sequence that is not an xsw:transform). Not knowing any better, we
     simply pass the incoming document along unchanged. -->
<xsl:template match="*">
<xsl:param name="sourcedoc" as="document-node()"/>
<xsl:sequence select="$sourcedoc"/>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<x:description xmlns:x="http://www.jenitennison.com/xslt/xspec" stylesheet="xhtml-jats.xsl">
<!-- Inline mapping check: an XHTML paragraph holding a, b, cite, code, em, i, q, strong, sub,
     sup and u (plus a target paragraph with an id) is compared against the expected JATS
     elements in no namespace. The br case is commented out of the input. -->
<x:scenario label="Basic inlines">
<x:context>
<body xmlns="http://www.w3.org/1999/xhtml">
<p>
<a href="file.html">external link</a>
<a href="#target">internal link</a>
<b>bold</b>
<!--<br/> break-->
<cite>cite</cite>
<code>inline code</code>
<em>emphasis</em>
<i>italic</i>
<q>quoth</q>
<strong>strong</strong>
<sub>subscript</sub>
<sup>superscript</sup>
<u>underline</u>
</p>
<p id="target">Target p with id</p>
</body>
</x:context>
<x:expect label="JATS result">
<body>
<p>
<ext-link xlink:href="file.html" xmlns:xlink="http://www.w3.org/1999/xlink">external link</ext-link>
<xref rid="target">internal link</xref>
<bold>bold</bold>
<mixed-citation>cite</mixed-citation>
<code>inline code</code>
<italic>emphasis</italic>
<italic>italic</italic>
<styled-content style="quoted">quoth</styled-content>
<bold>strong</bold>
<sub>subscript</sub>
<sup>superscript</sup>
<underline>underline</underline>
</p>
<p id="target">Target p with id</p>
</body>
</x:expect></x:scenario>
<!-- List inputs: a flat list, a list whose items hold paragraphs, and an item with a line
     break. The lists are placed in the XHTML namespace, like the input of the first scenario,
     because the stylesheet under test matches elements by XHTML name; in no namespace only
     its catch-all rule could ever apply.
     TODO: no x:expect yet, so this scenario asserts nothing until the list mapping is settled. -->
<x:scenario label="Lists of whatever sort">
<x:context>
<ul xmlns="http://www.w3.org/1999/xhtml">
<li>Here's stuff in a list</li>
</ul>
<ul xmlns="http://www.w3.org/1999/xhtml">
<li><p>Here's stuff in a list</p>
<p>but structured with paragraphs</p></li>
<li><p>Another item</p></li>
<li>list item with <br/> implicit structure</li>
</ul>
</x:context>
</x:scenario>
</x:description>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE book
PUBLIC "-//NLM//DTD BITS Book Interchange DTD with OASIS and XHTML Tables v2.0 20151225//EN"
"BITS-book2.dtd">
<book dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:mml="http://www.w3.org/1998/Math/MathML">
<book-meta>
<book-title-group>
<book-title/>
</book-title-group>
<contrib-group>
<contrib>
<contrib-id/>
<name>
<surname/>
<given-names/>
</name>
</contrib>
</contrib-group>
</book-meta>
<book-body>
<book-part>
<!--<book-part-meta>
<title-group>
<title/>
</title-group>
</book-part-meta>-->
<body>
<p></p>
</body>
</book-part>
</book-body>
<book-back>
<ref-list>
<ref>
<mixed-citation/>
</ref>
</ref-list>
</book-back>
</book>
<!DOCTYPE article
PUBLIC "-//NLM//DTD JATS (Z39.96) Article Authoring DTD with MathML3 v1.1 20151215//EN" "JATS-articleauthoring1-mathml3.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:mml="http://www.w3.org/1998/Math/MathML"
dtd-version="1.1">
<front>
<article-meta>
<title-group>
<article-title/>
</title-group>
<contrib-group>
<contrib>
<name>
<surname/>
<given-names/>
</name>
</contrib>
</contrib-group>
<abstract>
<p/>
</abstract>
</article-meta>
</front>
<body>
<sec>
<title/>
<sec>
<title>Specimen Document</title>
<p/>
<p>Here’s a specimen document for purposes of testing XSweet support for conversion from HTML to JATS.</p>
<p>We start as a Word document so as to be able to produce “typical” HTML Typescript as produced by XSweet. This enables us to map the specific sorts of results we are likely to see there, including “peculiar” and uncontrolled inputs such as footnotes, etc. with their implicit target semantics.</p>
<sec>
<title>Formatting</title>
<p>Formatting includes inline stuff such as <bold>bold</bold>, <italic>italics</italic>, <underline>underlines</underline>, <sup>super</sup>script and <sub>sub</sub>script.</p>
</sec>
<sec>
<title>Sections</title>
<p>Sections are implicit, but might result from a successful XSweet section induction process. So we might wish to model them as well. They will be represented by <named-content content-type="xsw_fontConsolas9pt"/> elements without non-div following siblings…</p>
</sec>
<sec>
<title>Lists</title>
<p>We have lists, including the usual sorts:</p>
<list list-type="bullet">
<list-item>
<p>Bullet item one</p>
</list-item>
<list-item>
<p>Second bullet</p>
</list-item>
<list-item>
<p>Third bullet</p>
</list-item>
<list-item>
<p>Fourth bullet</p>
</list-item>
</list>
<p>And numbered lists</p>
<list list-type="bullet">
<list-item>
<p>Numbered item one</p>
</list-item>
<list-item>
<p>Number two</p>
</list-item>
<list-item>
<p>Number three</p>
</list-item>
<list-item>
<p>Number four</p>
</list-item>
</list>
<p>Tables</p>
<p>If you thought we’d neglects tables you are regrettably incorrect!</p>
<table-wrap>
<table>
<tr>
<td>
<p>One</p>
</td>
<td>
<p>Two</p>
</td>
<td>
<p>Three</p>
</td>
<td>
<p>Four</p>
</td>
</tr>
<tr>
<td>
<p>Ichi</p>
</td>
<td>
<p>Ni</p>
</td>
<td>
<p>San</p>
</td>
<td>
<p>Shi</p>
</td>
</tr>
<tr>
<td>
<p>Eins</p>
</td>
<td>
<p>Zwei</p>
</td>
<td>
<p>Drei</p>
</td>
<td>
<p>Vier</p>
</td>
</tr>
</table>
</table-wrap>
<p>
<target id="docx-bookmark_0"/>
<xref rid="docx-bookmark_0"/>
</p>
<p/>
<p/>
</sec>
</sec>
</sec>
<sec>
<title/>
</sec>
<sec>
<title/>
</sec>
</body>
</article>
<?xsweet touched by header promotion logic: 2019-01-14-05:00?>
<?xsweet header promotion by outline levels (by default, from detected outline levels) ?>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Specimen Document</title>
<meta charset="UTF-8"/>
<style type="text/css">
.Heading1 { ; /* Normal*/ ; /* Heading1*/ margin-top: 24pt; margin-bottom: 0pt; -xsweet-outline-level: 0; font-weight: bold; color: #365F91; font-size: 14pt }
.Heading2 { ; /* Normal*/ ; /* Heading2*/ margin-top: 10pt; margin-bottom: 0pt; -xsweet-outline-level: 1; font-weight: bold; color: #4F81BD; font-size: 13pt }
.ListParagraph { ; /* Normal*/ ; /* ListParagraph*/ margin-left: 36pt }</style>
<style type="text/css">
.xsw_marginbottom0ptmargintop24ptfont14ptoutlinelevel0bold365F91 { color: #365F91; font-size: 14pt; font-weight: bold; margin-bottom: 0pt; margin-top: 24pt; -xsweet-outline-level: 0 }
.xsw_marginbottom0ptmargintop10ptfont13ptoutlinelevel1bold4F81BD { color: #4F81BD; font-size: 13pt; font-weight: bold; margin-bottom: 0pt; margin-top: 10pt; -xsweet-outline-level: 1 }
.xsw_fontConsolas9pt { font-family: Consolas; font-size: 9pt }
.xsw_marginleft36ptlistlevel0 { margin-left: 36pt; -xsweet-list-level: 0 }
.xsw_marginbottom0pt { border-bottom-style: solid; border-bottom-width: 0.5pt; border-left-style: solid; border-left-width: 0.5pt; border-right-style: solid; border-right-width: 0.5pt; border-top-style: solid; border-top-width: 0.5pt; margin-bottom: 0pt; vertical-align: top }</style>
</head>
<body>
<div class="docx-body">
<section>
<h1 class="Heading1 xsw_marginbottom0ptmargintop24ptfont14ptoutlinelevel0bold365F91"
data-xsweet-outline-level="0">Specimen Document</h1>
<p><!-- empty --></p>
<p>Here’s a specimen document for purposes of testing XSweet support for conversion from HTML to JATS.</p>
<p>We start as a Word document so as to be able to produce “typical” HTML Typescript as produced by XSweet. This enables us to map the specific sorts of results we are likely to see there, including “peculiar” and uncontrolled inputs such as footnotes, etc. with their implicit target semantics.</p>
<section>
<h2 class="Heading2 xsw_marginbottom0ptmargintop10ptfont13ptoutlinelevel1bold4F81BD"
data-xsweet-outline-level="1">Formatting</h2>
<p>Formatting includes inline stuff such as <b>bold</b>, <i>italics</i>, <u>underlines</u>, <sup>super</sup>script and <sub>sub</sub>script.</p>
</section>
<section>
<h2 class="Heading2 xsw_marginbottom0ptmargintop10ptfont13ptoutlinelevel1bold4F81BD"
data-xsweet-outline-level="1">Sections</h2>
<p>Sections are implicit, but might result from a successful XSweet section induction process. So we might wish to model them as well. They will be represented by <span class="xsw_fontConsolas9pt">div</span> elements without non-div following siblings…</p>
</section>
<section>
<h2 class="Heading2 xsw_marginbottom0ptmargintop10ptfont13ptoutlinelevel1bold4F81BD"
data-xsweet-outline-level="1">Lists</h2>
<p>We have lists, including the usual sorts:</p>
<ul>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Bullet item one</p>
</li>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Second bullet</p>
</li>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Third bullet</p>
</li>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Fourth bullet</p>
</li>
</ul>
<p>And numbered lists</p>
<ul>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Numbered item one</p>
</li>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Number two</p>
</li>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Number three</p>
</li>
<li>
<p class="ListParagraph xsw_marginleft36ptlistlevel0"
data-xsweet-list-level="0">Number four</p>
</li>
</ul>
<p>Tables</p>
<p>If you thought we’d neglects tables you are regrettably incorrect!</p>
<table>
<tr>
<td class="xsw_marginbottom0pt">
<p>One</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Two</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Three</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Four</p>
</td>
</tr>
<tr>
<td class="xsw_marginbottom0pt">
<p>Ichi</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Ni</p>
</td>
<td class="xsw_marginbottom0pt">
<p>San</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Shi</p>
</td>
</tr>
<tr>
<td class="xsw_marginbottom0pt">
<p>Eins</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Zwei</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Drei</p>
</td>
<td class="xsw_marginbottom0pt">
<p>Vier</p>
</td>
</tr>
</table>
<p>
<a class="bookmarkStart" id="docx-bookmark_0"><!-- bookmark ='_GoBack'--></a>
<a href="#docx-bookmark_0"><!-- bookmark end --></a>
</p>
<p><!-- empty --></p>
<p><!-- empty --></p>
</section>
</section>
</div>
<div class="docx-endnotes"><!-- empty --></div>
<div class="docx-footnotes"><!-- empty --></div>
</body>
</html>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article
PUBLIC "-//NLM//DTD JATS (Z39.96) Article Authoring DTD with MathML3 v1.1 20151215//EN"
"JATS-articleauthoring1-mathml3.dtd">
<article dtd-version="1.1" xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:mml="http://www.w3.org/1998/Math/MathML">
<front>
<article-meta>
<title-group>
<article-title/>
</title-group>
<contrib-group>
<contrib>
<contrib-id/>
<name>
<surname/>
<given-names/>
</name>
</contrib>
</contrib-group>
<abstract>
<p></p>
</abstract>
</article-meta>
</front>
<body>
<p>
<!--<a href="file.html">external link</a>-->
<ext-link xlink:href="file.html">external link</ext-link>
<!--<a href="#t">internal link</a>-->
<xref rid="t">internal link</xref>
<!--<b>bold</b>-->
<bold>bold</bold>
<!--<br/> break-->
<!--<cite>cite</cite>-->
<code>inline code</code>
<!--<em>emphasis</em> JATS has no generic 'emph' equivalent -->
<italic>emphasis</italic>
<!--<i>italic</i>-->
<italic>italic</italic>
<!--<q>quoth</q>-->
<styled-content style="quoted">quoth</styled-content>
<!--<strong>strong</strong>-->
<bold>strong</bold>
<sub>subscript</sub>
<sup>superscript</sup>
<!--<u>underline</u>-->
<underline>underline</underline>
</p>
<p id="t"/>
<!--<ul>
<li>Here's stuff in a list</li>
</ul>-->
<!--<ul>
<li><p>Here's stuff in a list</p>
<p>but structured with paragraphs</p></li>
<li><p>Another item</p></li>
<li>list item with <br/> implicit structure</li>
</ul>-->
</body>
</article>
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:math="http://www.w3.org/2005/xpath-functions/math"
exclude-result-prefixes="xs math"
version="3.0">
<xsl:mode on-no-match="shallow-copy"/>
<!-- Wraps the incoming content in a JATS article element (DTD version 1.1, with the XLink and
     MathML namespaces declared), placing the stock front matter ahead of the processed input. -->
<xsl:template match="/">
  <article xmlns:mml="http://www.w3.org/1998/Math/MathML"
           xmlns:xlink="http://www.w3.org/1999/xlink"
           dtd-version="1.1">
    <xsl:sequence select="$front-matter/node()"/>
    <xsl:apply-templates select="node()"/>
  </article>
</xsl:template>
<!-- Skeleton JATS front matter copied into every article by the root template: an empty
     article title, one contributor name and a one-paragraph abstract. The comments inside the
     elements are stylesheet comments marking where real values belong; they are not written
     to the result. -->
<xsl:variable name="front-matter">
<front>
<article-meta>
<title-group>
<article-title><!-- article title --></article-title>
</title-group>
<contrib-group>
<contrib>
<name>
<surname><!-- surname --></surname>
<given-names><!-- given names --></given-names>
</name>
</contrib>
</contrib-group>
<abstract>
<p><!-- abstract paragraph --></p>
</abstract>
</article-meta>
</front>
</xsl:variable>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:math="http://www.w3.org/2005/xpath-functions/math"
xmlns="http://www.w3.org/1999/xhtml"
xpath-default-namespace="http://www.w3.org/1999/xhtml"
xmlns:xsw="http://coko.foundation/xsweet"
exclude-result-prefixes="#all"
version="2.0">
<!-- XSweet: Performs header promotion based on outline level [2] -->
<!-- Input: an HTML Typescript document (wf) -->
<!-- Output: a copy, with headers promoted according to outline levels detected on paragraphs -->
<xsl:output method="xml" indent="no" omit-xml-declaration="yes"/>
<!-- Identity rule: everything without a more specific template is copied through as is. -->
<xsl:template match="@* | node()">
  <xsl:copy>
    <xsl:apply-templates select="@*, node()"/>
  </xsl:copy>
</xsl:template>
<!-- Reads the nominal outline level of a node from its data-xsweet-outline-level attribute
     and returns it incremented by one; returns the empty sequence when the attribute is
     missing or is not an integer. -->
<xsl:function name="xsw:outline-level" as="xs:integer?">
  <xsl:param name="who" as="node()"/>
  <!-- Earlier brute-force reading of the xsweet-outline-level CSS pseudo-property, kept for reference:
  <xsl:variable name="outline-spec" select="replace($who/@style,'^.*xsweet\-outline\-level:\s*','')"/>
  <xsl:variable name="outline-level" select="replace($outline-spec,'\D.*$','')"/> -->
  <xsl:variable name="declared" select="$who/@data-xsweet-outline-level"/>
  <xsl:sequence select="if ($declared castable as xs:integer) then xs:integer($declared) + 1 else ()"/>
</xsl:function>
<!-- Lookup table from outline level to heading element. The distinct outline levels found on
     paragraphs inside body are sorted ascending; the first six each yield an empty h1..h6
     element whose level attribute records the outline level it stands for. -->
<xsl:variable name="level-map" as="element()*">
  <xsl:for-each-group select="//body//p[exists(xsw:outline-level(.))]" group-by="xsw:outline-level(.)">
    <xsl:sort select="current-grouping-key()"/>
    <!-- Rank of this level among all levels present (1 = lowest outline level). -->
    <xsl:variable name="rank" select="position()"/>
    <xsl:if test="$rank le 6">
      <xsl:element name="h{$rank}">
        <xsl:attribute name="level" select="current-grouping-key()"/>
      </xsl:element>
    </xsl:if>
  </xsl:for-each-group>
</xsl:variable>
<!-- Rebuilds the XHTML body element, keeping its attributes and processing its content. -->
<xsl:template match="body">
  <xsl:element name="body" namespace="http://www.w3.org/1999/xhtml">
    <xsl:copy-of select="@*"/>
    <!-- Diagnostic: enable to dump the level map into the output.
    <xsl:copy-of select="$level-map"/> -->
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- Promotes a paragraph whose outline level appears in $level-map to the XHTML heading
     element registered for that level, keeping its attributes and processing its content. -->
<xsl:template match="p[xsw:outline-level(.) = $level-map/@level]">
  <xsl:variable name="own-level" select="xsw:outline-level(.)"/>
  <xsl:variable name="heading" select="$level-map[@level = $own-level]"/>
  <xsl:element name="{$heading/local-name()}" namespace="http://www.w3.org/1999/xhtml">
    <xsl:copy-of select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
</xsl:stylesheet>
\ No newline at end of file
# HTML to JATS conversion
Version of JATS? (Authoring schema?)
Serialization - indented XML, DOCTYPE declaration ...
Sample instance cf `Conversion-zoo.docx`
XSpec
## Body mapping
* structures
* paragraph
* lists with and without sub-paragraphs and lists
* table
* div
* section
* aside? block quotes? specialized list types? other HTML blocks?
* inline elements
* see xspec
* footnotes and/or endnotes
* encode note body inline using `<fn>`, with `<xref>` for subsequent references
* two separate sequences/note types when both are present?
* images / figures?
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns:math="http://www.w3.org/2005/xpath-functions/math"
exclude-result-prefixes="xs math"
version="3.0"
xpath-default-namespace="http://www.w3.org/1999/xhtml">
<!-- Only the body of the XHTML document is processed; other children of html (such as head)
     are not selected and so contribute nothing to the result. -->
<xsl:template match="html">
<xsl:apply-templates select="body"/>
</xsl:template>
<!-- Default rule: an element without a dedicated template is re-created under the same local
     name in the target vocabulary (no namespace), with its attributes and content processed. -->
<!-- This may produce invalid output for any HTML not accounted for. -->
<xsl:template match="*">
  <xsl:variable name="bare-name" select="local-name()"/>
  <xsl:element name="{$bare-name}">
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- The class attribute and the XSweet data-* bookkeeping attributes are dropped. -->
<xsl:template match="@data-xsweet-outline-level | @data-xsweet-list-level | @class"/>
<!-- Every other attribute is carried over unchanged. -->
<xsl:template match="@*">
  <xsl:copy/>
</xsl:template>
<!-- XHTML paragraph becomes a p in no namespace; attributes go through the attribute rules. -->
<xsl:template match="p">
  <xsl:element name="p">
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- A link whose href starts with '#' is an internal cross-reference: it becomes xref, with
     rid set to the fragment identifier (the part of href after the '#'). -->
<xsl:template match="a[starts-with(@href,'#')]" priority="2">
  <xsl:element name="xref">
    <xsl:attribute name="rid" select="substring-after(@href,'#')"/>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- Any other link with an href becomes ext-link, with the address carried in xlink:href. -->
<xsl:template match="a[@href]">
  <ext-link xmlns:xlink="http://www.w3.org/1999/xlink">
    <xsl:attribute name="xlink:href" select="@href"/>
    <xsl:apply-templates select="node()"/>
  </ext-link>
</xsl:template>
<!-- An anchor without href becomes target; its attributes go through the attribute rules. -->
<xsl:template match="a[empty(@href)]">
  <xsl:element name="target">
    <xsl:apply-templates select="@*"/>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- Both b and strong map to bold. -->
<xsl:template match="strong | b">
  <xsl:element name="bold">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- usually not going to be valid:
<xsl:template match="br">
<lb>
<xsl:apply-templates/>
</lb>
</xsl:template> -->
<!-- cite maps to mixed-citation. -->
<xsl:template match="cite">
  <xsl:element name="mixed-citation">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- code keeps its name but moves to no namespace; its attributes are not carried over. -->
<xsl:template match="code">
  <xsl:element name="code">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- Both em and i map to italic. -->
<xsl:template match="i | em">
  <xsl:element name="italic">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- q maps to styled-content flagged with style="quoted". -->
<xsl:template match="q">
  <xsl:element name="styled-content">
    <xsl:attribute name="style">quoted</xsl:attribute>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<xsl:template match="sub">
<sub>