<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!-- Source: istex / ade-outils-xml, xml/wiley-sample.xml at be7323487dc5b2c4224f10afe75c83a2bf7d3bb6 (niederle, 29 Dec 2015, 139 KB, "1er commit") -->
<component xmlns="http://www.wiley.com/namespaces/wiley" version="2.0" type="serialArticle" xml:lang="en">
  <header>
    <publicationMeta level="product">
      <publisherInfo>
        <publisherName>Wiley Subscription Services, Inc., A Wiley Company</publisherName>
        <publisherLoc>Hoboken</publisherLoc>
      </publisherInfo>
      <doi registered="yes">10.1002/(ISSN)1532-2890</doi>
      <issn type="print">1532-2882</issn>
      <issn type="electronic">1532-2890</issn>
      <idGroup>
        <id type="product" value="ASI" />
      </idGroup>
      <titleGroup>
        <title type="main" xml:lang="en" sort="JOURNAL OF THE AMERICAN SOCIETY FOR INFORMATION SCIENCE AND TECHNOLOGY">Journal of the American Society for Information Science and Technology</title>
        <title type="short">J. Am. Soc. Inf. Sci.</title>
      </titleGroup>
      <selfCitationGroup>
        <citation type="ancestor" xml:id="cit1">
          <journalTitle>Journal of the American Society for Information Science</journalTitle>
          <accessionId ref="info:x-wiley/issn/00028231">0002-8231</accessionId>
          <accessionId ref="info:x-wiley/issn/10974571">1097-4571</accessionId>
          <pubYear year="2000">2000</pubYear>
          <vol>51</vol>
          <issue>14</issue>
        </citation>
      </selfCitationGroup>
    </publicationMeta>
    <publicationMeta level="part" position="90">
      <doi origin="wiley" registered="yes">10.1002/asi.v62.9</doi>
      <numberingGroup>
        <numbering type="journalVolume" number="62">62</numbering>
        <numbering type="journalIssue">9</numbering>
      </numberingGroup>
      <coverDate startDate="2011-09">September 2011</coverDate>
    </publicationMeta>
    <publicationMeta level="unit" type="article" position="4" status="forIssue">
      <doi origin="wiley" registered="yes">10.1002/asi.21591</doi>
      <idGroup>
        <id type="unit" value="ASI21591" />
      </idGroup>
      <countGroup>
        <count type="pageTotal" number="21" />
      </countGroup>
      <titleGroup>
        <title type="articleCategory">Research Article</title>
        <title type="tocHeading1">Research Articles</title>
      </titleGroup>
      <copyright ownership="publisher">© 2011 ASIS&amp;T</copyright>
      <eventGroup>
        <event type="manuscriptReceived" date="2011-03-17" />
        <event type="manuscriptRevised" date="2011-05-18" />
        <event type="manuscriptAccepted" date="2011-05-19" />
        <event type="xmlConverted" agent="Converter:JWSART34_TO_WML3G version:2.5.2 mode:FullText mathml2tex" date="2011-08-10" />
        <event type="publishedOnlineEarlyUnpaginated" date="2011-07-05" />
        <event type="firstOnline" date="2011-07-05" />
        <event type="publishedOnlineFinalForm" date="2011-08-10" />
        <event type="xmlConverted" agent="Converter:WILEY_ML3G_TO_WILEY_ML3GV2 version:3.8.8" date="2014-01-06" />
        <event type="xmlConverted" agent="Converter:WML3G_To_WML3G version:4.1.7 mode:FullText,remove_FC" date="2014-10-30" />
      </eventGroup>
      <numberingGroup>
        <numbering type="pageFirst">1696</numbering>
        <numbering type="pageLast">1716</numbering>
      </numberingGroup>
      <subjectInfo>
        <subject href="psi.asis.org/digital/data+formats">data formats</subject>
        <subject href="psi.asis.org/digital/overlap">overlap</subject>
        <subject href="psi.asis.org/digital/markup+languges">markup languages</subject>
        <subject href="psi.asis.org/digital/semantic+web">semantic web</subject>
        <subject href="psi.asis.org/digital/data+conversion">data conversion</subject>
      </subjectInfo>
      <linkGroup>
        <link type="toTypesetVersion" href="file:ASI.ASI21591.pdf" />
      </linkGroup>
    </publicationMeta>
    <contentMeta>
      <countGroup>
        <count type="figureTotal" number="7" />
        <count type="tableTotal" number="1" />
        <count type="referenceTotal" number="39" />
      </countGroup>
      <titleGroup>
        <title type="main" xml:lang="en">A Semantic Web approach to everyday overlapping markup</title>
      </titleGroup>
      <creators>
        <creator xml:id="au1" creatorRole="author" affiliationRef="#af1">
          <personName>
            <givenNames>Angelo</givenNames>
            <familyName>Di Iorio</familyName>
          </personName>
          <contactDetails>
            <email>diiorio@cs.unibo.it</email>
          </contactDetails>
        </creator>
        <creator xml:id="au2" creatorRole="author" affiliationRef="#af1">
          <personName>
            <givenNames>Silvio</givenNames>
            <familyName>Peroni</familyName>
          </personName>
          <contactDetails>
            <email>speroni@cs.unibo.it</email>
          </contactDetails>
        </creator>
        <creator xml:id="au3" creatorRole="author" affiliationRef="#af1">
          <personName>
            <givenNames>Fabio</givenNames>
            <familyName>Vitali</familyName>
          </personName>
          <contactDetails>
            <email>fabio@cs.unibo.it</email>
          </contactDetails>
        </creator>
      </creators>
      <affiliationGroup>
        <affiliation xml:id="af1" countryCode="IT" type="organization">
          <unparsedAffiliation>Department of Computer Science, University of Bologna, Bologna, Italy</unparsedAffiliation>
        </affiliation>
      </affiliationGroup>
      <abstractGroup>
        <abstract type="main" xml:lang="en">
          <title type="main">Abstract</title>
          <p>Overlapping structures in XML are not symptoms of a misunderstanding of the intrinsic characteristics of a text document nor evidence of extreme scholarly requirements far beyond those needed by the most common XML‐based applications. On the contrary, overlaps have started to appear in a large number of incredibly popular applications hidden under the guise of syntactical tricks to the basic hierarchy of the XML data format. Unfortunately, syntactical tricks have the drawback that the affected structures require complicated workarounds to support even the simplest query or usage. In this article, we present Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), an approach to overlapping markup that simplifies and streamlines the management of multiple hierarchies on the same content, and provides an approach to sophisticated queries and usages over such structures without the need of ad‐hoc applications, simply by using Semantic Web tools and languages. We compare how relevant tasks (e.g., the identification of the contribution of an author in a word processor document) are of some substantial complexity when using the original data format and become more or less trivial when using EARMARK. We finally evaluate positively the memory and disk requirements of EARMARK documents in comparison to Open Office and Microsoft Word XML‐based formats.</p>
        </abstract>
      </abstractGroup>
    </contentMeta>
  </header>

  <body sectionsNumbered="no">
    <section xml:id="sec1-1">
      <title type="main">Introduction</title>
      <p>The overwhelming consensus among XML practitioners is that documents are trees, the hierarchy is the fundamental data structure, and violations of the hierarchy are errors or unnecessary complications. Therefore, overlapping markup has received ambivalent, almost schizoid considerations in the field of markup languages. Traditionally, overlaps were the hallmarks of bad HTML coders and naïve HTML page editors, taking advantage of the unjustified benevolence in web browsers that would display basically any HTML regardless of proper nesting. At the same time, far from the awareness of the general public, overlaps have been a fringe, almost esoteric, discipline of scholars in the humanities, competently used for arcane specifications of linguistic annotations and literary analysis.</p>
      <p>Although the first type of overlap was judged with scorn and the second with awe, they both fundamentally represent a situation that is more common than was thought, and the scholars were only more aware, and not more justified, about the need to represent overlaps.</p>
      <p>Generally, overlap is needed when multiple independent items refer to the same segment, either when considering textual markup documents or multimedia structures (Salembier &amp; Benitez,
        <link href="#bib30">2007</link>). Regarding documents with markup, we need overlap whenever multiple markup elements need to be applied over the same content, and these elements happen to be independent of each other. In some (rather frequent) situations, this independence means that the content referred to by some elements is partially, but not completely, the same as the content referred to by other elements.</p>
      <p>This situation is more frequent than it may appear: Not only do bad HTML code and arcane linguistic annotations use overlap, but many more mainstream and mundane examples exist. For instance, change tracking in an office document is often at odds with the underlying structure of the text, microformats (Allsopp,
        <link href="#bib2">2007</link>) and Resource Description Framework–in–attributes (RDFa; Adida, Birbeck, McCarron, &amp; Pemberton,
        <link href="#bib1">2008</link>) annotations may need to refer to concepts that span across multiple XML elements, complex data structures (e.g., biological data) force graphs into trees and hide multiple parentage as internal references, and so on.</p>
      <p>Differently from SGML, which is able to handle some overlapping scenarios through the CONCUR notation (Goldfarb,
        <link href="#bib18">1990</link>), XML grammatically imposes and requires a strict hierarchy of containment generating a single mathematical tree of the document where no overlap is allowed. This requirement has been turned into an intrinsic characteristic of the documents XML was meant to represent rather than a syntactical and conceptual constraint into which these documents need to fit. Thus, whenever authors needed to cope with independent markup elements, they managed either by naïvely ignoring the hierarchical limitation (and therefore creating invalid documents) or by creating careful workarounds within the syntactical constraint, or even by inventing completely new markup languages that allow some types of overlap. But while new multihierarchical markup languages such as TexMecs (Huitfeldt &amp; Sperberg‐McQueen,
        <link href="#bib21">2003</link>) and LMNL (Tennison &amp; Piez,
        <link href="#bib37">2002</link>) have a small number of adepts and applications, and while bad HTML coders and bad HTML page editors are disappearing from the market, the careful workarounds within the XML syntax (TEI Consortium,
        <link href="#bib36">2005</link>), such as segmentation, milestones, or standoff markup, are to this day frequently used and ubiquitous.</p>
      <p>All workarounds share the same approach of hiding structural information about a secondary hierarchy under the guise of something else: split individual elements, empty boundary elements, indirect references, and so on. The result is that the secondary structural information is hidden or its importance is lessened to not break or obfuscate the main hierarchy expressed in the visible XML structure. But this comes at a price: Structures specified through workarounds are more difficult to find, identify, and act upon than are the structures in the main XML hierarchy. Thus, trivial searches that should amount to a short XPath in a more direct situation end up being multiple‐lines long, pretty basic visualizations require incredibly complex XSLT stylesheets, specific choices of the main markup hierarchy actually prevent some features of the secondary markup from even existing, and so on. So, although workarounds exist and can be used, hierarchies expressed through them are “second‐class citizens” that cannot fully exploit the sophisticated tools that the XML language provides.</p>
      <p>In this article, we show how Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), our proposal for managing overlapping markup, does not generate first‐ and second‐class hierarchies, and allows existing, sophisticated tools to be used on all markup—even in the presence of overlaps. Rather than creating a completely new language requiring completely new tools and competencies, EARMARK uses Semantic Web technologies and Semantic Web tools to obtain many of the results obtainable with usual XML tools.</p>
      <p>EARMARK defines markup vocabularies by means of OWL ontologies (W3C OWL Working Group,
        <link href="#bib39">2009</link>). Since each individual markup item is an independent assertion over some content or some other assertions, overlaps of content are not a problem, nor are all the issues connected to physical embedding and containments, such as contiguity and document order. Furthermore, by using standard Semantic Web technologies, fairly sophisticated functionalities can be provided over EARMARK documents.</p>
      <p>Through EARMARK, operations that were previously very hard or impossible exactly because of the interferences of the multiple hierarchies or of the workarounds they employed now become fundamentally trivial since no syntactical tricks are employed and the different hierarchies do not interfere with each other. Thus, for instance, identifying the individual contributions in a multi‐authored MS Word or Open Office document is quite hard on their original XML formats, and becomes trivial when the same documents are converted into EARMARK.</p>
      <p>This article is an extended version of previous works on EARMARK (Di Iorio, Peroni, &amp; Vitali,
        <link href="#bib11">2010</link>; Peroni &amp; Vitali,
        <link href="#bib27">2009</link>). In those works, we focused on identifying workarounds for overlapping data existing in real XML documents and translating them into EARMARK assertions. We also sketched the EARMARK ontology and presented a simple implementation of EARMARK‐aware tools. This article follows and extends them and also provides some novel contributions:
        <list xml:id="l1" style="bulleted">
          <listItem>
            <p>The systematic analysis of the EARMARK model, with particular attention to data typing and overlapping structures</p>
          </listItem>
          <listItem>
            <p>The discussion of further applications for the ontological EARMARK approach. In particular, we show how EARMARK can be used to improve the content filtering and reversions mechanisms of wikis.</p>
          </listItem>
          <listItem>
            <p>The brief description of a process, called <i>ROCCO</i>, for generating EARMARK documents from existing XML documents (even ones that use workarounds for overlapping structures)</p>
          </listItem>
          <listItem>
            <p>An evaluation of EARMARK efficiency when dealing with multiple hierarchies in comparison with the XML structures used by popular XML‐based formats such as Office Open and MS Word.</p>
          </listItem>
        </list>
      </p>
      <p>The article is structured as follows: First, we provide a brief overview of existing approaches to overlap using workarounds in XML or ad hoc markup metalanguages, and then give a few examples of situations where overlaps are used today and sometimes in rather mainstream situations. Next, we present the EARMARK model and its rules. Then, we provide some use cases that are meant to demonstrate the superiority of the EARMARK approach to a traditional XML format, especially when overlaps come into question, and show the generation of EARMARK documents, converting legacy documents. An initial evaluation of the efficiency of EARMARK compared to popular XML based data formats such as Open Document (ODT) and Office Open XML (OOXML) is presented, followed by our conclusions.</p>
    </section>
    <section xml:id="sec1-2">
      <title type="main">Existing Approaches to Overlapping</title>
      <p>The need for multiple overlapping structures over documents using markup syntaxes such as XML and SGML is an age‐old issue, and a large amount of literature exists on the techniques, languages, and tools that allow users to create multiple entangled hierarchies over the same content. A good review can be found in DeRose (
        <link href="#bib8">2004</link>).</p>
      <p>Some research has proposed using plain hierarchical markup (i.e., XML) and employing specially tailored elements or attributes to express the semantics of overlapping in an implicit way. The TEI Guidelines (TEI Consortium,
        <link href="#bib36">2005</link>) presented a number of different techniques that use SGML/XML constructs to force multiple hierarchies into a single one, including:
        <list xml:id="l2" style="bulleted">
          <listItem>
            <p><i>milestones</i> (i.e., overlapping structures are expressed through empty elements to mark the boundaries of the “content”),</p>
          </listItem>
          <listItem>
            <p><i>fragmentation</i> (i.e., overlapping structures are split into individual, nonoverlapping elements that may even be linked through <i>id–idref</i> pairs), and</p>
          </listItem>
          <listItem>
            <p><i>standoff markup</i> (i.e., overlapping structures are placed elsewhere and indirectly refer to their would‐be locations through pointers, locators, and/or <i>id–idref</i> pairs).</p>
          </listItem>
        </list>
      </p>
      <p>Given the large number of techniques to deal with overlapping structures in XML, in Marinelli, Vitali, and Zacchiroli (
        <link href="#bib25">2008</link>), we presented a number of algorithms to convert XML documents with overlapping structures from and to the most common approaches, as well as a prototype implementation.</p>
      <p>Riggs (
        <link href="#bib29">2002</link>) introduced a slightly different technique for fragmentation within XML structures. In this proposal, <i>floating elements</i> (i.e., those elements that do not fall in a proper or meaningful hierarchical order) are created using the name of the element followed by an index referring to its semantically related parent element. For example, the floating element &lt;name.person[2]&gt;John&lt;/name.person[2]&gt; means that &lt;name&gt;John&lt;/name&gt; is semantically a child of the second occurrence of the element <i>person</i>, even though the floating element is not structurally contained by its logical parent.</p>
      <p>Other research even has proposed to get rid of the theory of trees at the base of XML/SGML altogether and use different underlying models and newly invented XML‐like languages that allow the expression of overlaps through some kind of syntactical flourishing. For instance, a general ordered‐descendant directed acyclic graph (<i>GODDAG;</i> Sperberg‐McQueen &amp; Huitfeldt,
        <link href="#bib35">2004</link>) is a family of graph‐theoretical data structures to handle overlapping markup. A GODDAG's nodes represent markup elements and text. Arcs are used to explicitly represent containment and father–child relations. Since multiple arcs can be directed to the same node, overlapping structures can be straightforwardly represented in GODDAG. Full GODDAGs cannot be linearized in any form using embedded markup, but <i>restricted GODDAGs</i>, a subset thereof, can be and have been linearized into TexMecs (Huitfeldt &amp; Sperberg‐McQueen,
        <link href="#bib21">2003</link>), a multihierarchical markup language that also allows full GODDAGs through appropriate nonembedding workarounds such as standoff markup.</p>
      <p><i>LMNL</i> (Tennison &amp; Piez,
        <link href="#bib37">2002</link>) is a general data model based on the idea of layered <i>text fragments and ranges</i>, where multiple types of overlap can be modeled using concepts drawn from the mathematical theory of intervals. Multiple serializations of LMNL exist, such as CLIX and LMNL‐syntax.</p>
      <p><i>XConcur</i> (Schonefeld &amp; Witt,
        <link href="#bib33">2006</link>) is a similar solution based on the representation of multiple hierarchies within the same document through <i>layers</i>. Strictly related to its predecessor CONCUR as it was included in the SGML, XConcur was developed in conjunction with the validation language XConcur‐CL to handle relationships and constraints between multiple hierarchies.</p>
      <p>The <i>variant graph</i> approach (Schmidt &amp; Colomb,
        <link href="#bib32">2009</link>) also is based on graph theory. Developed to deal with textual variations that generate multiple versions of the same document with multiple overlapping hierarchies, this theory proposes a new data model to represent literary documents and a graph linearization (based on lists) that scales well even with a large number of versions. The same authors recently presented an extension of their theory that also allows users to merge multiple variants into one document (Schmidt,
        <link href="#bib31">2009</link>). Portier and Calabretto (
        <link href="#bib28">2009</link>) presented a detailed survey of overlapping approaches and also discussed the MultiX<sup>2</sup> data model, which uses W3C standard languages such as XInclude to link and fetch text fragments within overlapping structures, and a prototype editor for the creation of multistructured documents.</p>
      <p>Tummarello, Morbidoni, and Pierazzo (
        <link href="#bib38">2005</link>) proposed using RDF as a standoff notation for overlapping structures of XML documents. Since this proposal has many affinities with the one we are presenting in this article, we later discuss its characteristics and compare it with ours.</p>
    </section>
    <section xml:id="sec1-3">
      <title type="main">More Frequent Than One May Think: Overlapping in the Wild</title>
      <p>Overlapping structures have been considered often as appropriate only in highly specific contexts and basically for scholars: The solutions that have been proposed in the literature were complex since they were considered grounded in the intrinsic complexity of the topics themselves. Yet, overlapping structures can be found in many more fields than these, and even mainstream applications generate and use markup with overlapping structures. While the complexity of overlapping is hidden to the final user, applications that consume such data may very well find it rather difficult to handle such information. We next discuss three very different contexts where overlapping already exists and fairly relevant information is encoded in multiple independent structures, leaving to special code the task of managing the complexity.</p>
      <section xml:id="sec2-1">
        <title type="main">Change Tracking in Office Document Formats</title>
        <p>Word processors such as Microsoft Word and Open Office provide users with powerful tools for tracking changes, allowing each individual modification by individual authors to be identified, highlighted, and acted upon (e.g., by accepting or discarding them). The intuitiveness of the relevant interfaces actually hides the complexity of the data format and of the algorithms necessary to handle such information.</p>
        <p>For instance, the standard ODT format (JTC1/SC34 WG 6,
          <link href="#bib23">2006</link>) used by Open Office, when saving change‐tracking information, relies on two specific constructs for insertions and deletions that may overlap with the structural markup. Adding a few words within a paragraph is not in itself complex, as it does not require the breaking of the fundamental structural hierarchy; conversely, changes that affect the structure itself (e.g., the split of one paragraph in two by the insertion of a return character, or the joining of two paragraphs by the elimination of the intermediate return character) require that annotations are associated to the end of a paragraph and the beginning of the next, in an unavoidably overlapping pattern. ODT uses milestones and standoff markup for insertions and deletions, respectively, and also relies on standoff markup for annotations about the authorship and date of the change.</p>
        <p>For instance, the insertion of a return character and a few characters in a paragraph creates a structure as follows:</p>
        <p>
          <displayedItem xml:id="di-ueqn-1" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq001" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The empty elements &lt;text:change‐start/&gt; and &lt;text:change‐end/&gt; are <i>milestones</i> marking the beginning and the end, respectively, of the range that constituted the insertion while the element &lt;text:insertion&gt;, before the beginning of the document content, is <i>standoff markup</i> for the metadata about the change (author and date information).</p>
        <p>Similarly, a deletion creates a structure as follows:</p>
        <p>
          <displayedItem xml:id="di-ueqn-2" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq002" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The element &lt;text:change/&gt; represents a <i>milestone</i> of the location where the deletion took place in the content, and the corresponding <i>standoff markup</i> annotation &lt;text:deletion&gt; contains not only the metadata about the change but also the text that was deleted.</p>
        <p>The OOXML format (JTC1/SC34 WG 4,
          <link href="#bib22">2008</link>) (the XML‐based format used by Microsoft Office 2007 and standardized by ISO in 2008), on the other hand, uses a form of <i>segmentation</i> to store change‐tracking information across all previous elements involved.</p>
        <p>
          <displayedItem xml:id="di-ueqn-3" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq003" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>This heavily simplified version of an OOXML document shows two separate changes: (a) the insertion of a return character and (b) the insertion of a word. These modifications are not considered as a single change; therefore, the segments are not connected to each other but simply created as needed to fit the underlying structure.</p>
        <p>In fact, change tracking in OOXML is a fairly complex proposition. Although providing more complete coverage of special cases and situations than does ODT, dealing with its intricacies is not for the casual programmer. Even a simple XSLT stylesheet to show inserted text in a different color and hide deleted text may run several hundred lines of code.
          <link href="#note1">1</link>
          <note xml:id="note1">
            <p>
              <url href="http://OOXMLdeveloper.org/archive/2006/09/07/625.aspx">http://OOXMLdeveloper.org/archive/2006/09/07/625.aspx</url>
            </p>
          </note>
        </p>
      </section>
      <section xml:id="sec2-2">
        <title type="main">Overlapping With Microformats</title>
        <p>Microformats (Allsopp,
          <link href="#bib2">2007</link>) add semantic markup to web documents by using common structures of the HTML language itself—in particular, the <i>class</i> attribute.</p>
        <p>The HTML code is annotated using microformats to provide new semantic, machine‐processable assertions. In the following example, a plain HTML table is enriched with metadata about events
          <link href="#note2">2</link>
          <note xml:id="note2">
            <p>HCalendar,
              <url href="http://microformats.org/wiki/hcalendar">http://microformats.org/wiki/hcalendar</url>
            </p>
          </note>
          and people:
          <link href="#note3">3</link>
          <note xml:id="note3">
            <p>HCard,
              <url href="http://microformats.org/wiki/hcard">http://microformats.org/wiki/hcard</url>
            </p>
          </note>
        </p>
        <p>
          <displayedItem xml:id="di-ueqn-4" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq004" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The table was enriched by additional data declaring it to be an event (a conference), and data about the event itself (URL, summary, location) and about four relevant individuals (with their names and roles within the conference) were associated where necessary to the actual content of the table.</p>
        <p>So far, so good, and no overlap to speak about. Things change dramatically, though, when the overall structure of the main hierarchy (the HTML table) is at odds with the intrinsic hierarchy of the microformat data, such as if the people are organized in columns rather than rows. For instance:</p>
        <p>
          <displayedItem xml:id="di-ueqn-5" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq005" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>Unfortunately, vcards are a hierarchy themselves, and if the hierarchy of vcards is organized differently from the hierarchy of the HTML table, as in the latter case, it is just impossible to define the four vcards for the four people organizing the conference. Thus, in plain HTML, the choice of one of two possible presentation models for the main hierarchy of content makes trivial or completely impossible the existence of the second hierarchy.</p>
        <p>A possible and partial solution to express vcard hierarchies in the latter example is RDFa (Adida et al.,
          <link href="#bib1">2008</link>), a W3C recommendation. It describes a mechanism to embed RDF statements into HTML documents by using some HTML attributes (<i>href, rel, rev, content</i>) in combination with other ad hoc attributes (<i>property, about, typeof</i>) proposed in the recommendation itself.</p>
        <p>
          <displayedItem xml:id="di-ueqn-6" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq006" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>Since all attributes live in the context of elements, the price to pay is that to assert everything we want to assert, we often need to add some structurally unnecessary elements to the current markup hierarchy of a document, needed only to add the RDF statements (e.g., the <i>span</i> elements emphasized earlier). Even if that does not represent a significant problem for strict Semantic Web theorists, document architects and markup experts see this as a kludge and an inelegant compromise.</p>
      </section>
    </section>
    <section xml:id="sec1-4">
      <title type="main">Wikis: No Overlapping Where Some Should Be</title>
      <p>The strength of wikis lies in their allowing users to modify content at any time. The mechanisms of change‐tracking and rollback that are characteristics of all wikis, in fact, promote users' contributions and make “malicious attacks” pointless in the long run since previous versions can be easily restored.</p>
      <p>A number of tools exist that automatically discover “wiki vandalisms” and provide users with powerful interfaces to surf changes, identify differences between subsequent versions, and revert content. For instance, Huggle
        <link href="#note4">4</link>
        <note xml:id="note4">
          <p>
            <url href="http://en.wikipedia.org/wiki/Wikipedia:Huggle">http://en.wikipedia.org/wiki/Wikipedia:Huggle</url>
          </p>
        </note>
        is an application dealing with vandalism in Wikipedia, based on a proxy architecture and .NET technologies. A straightforward interface allows users to access any version of a page, highlights contributions of a specific user, and reverts the content to old versions.</p>
      <p>Even client‐side tools—meant to be installed as browser extensions or bookmarklets—exist to extend the rollback mechanisms of Wikipedia, giving users more flexibility and control over (vandalistic) changes. For instance, Lupin
        <link href="#note5">5</link>
        <note xml:id="note5">
          <p>
            <url href="http://en.wikipedia.org/wiki/User:Lupin/Anti-vandal_tool">http://en.wikipedia.org/wiki/User:Lupin/Anti-vandal_tool</url>
          </p>
        </note>
        is a set of javascript scripts that check a wiki page against a list of forbidden terms so that authors can identify undesirable modifications and restore previous (i.e., good) versions without a continuous control over the full content of the page; yet again, Twinkle
        <link href="#note6">6</link>
        <note xml:id="note6">
          <p>
            <url href="http://en.wikipedia.org/wiki/Wikipedia:Twinkle">http://en.wikipedia.org/wiki/Wikipedia:Twinkle</url>
          </p>
        </note>
        provides users powerful rollback functions and includes a full library of batch deletion functions, automatic reporting of vandals, and user notification functions.</p>
      <p>These tools are successful in highlighting vandalism and in identifying versions created by malicious users. However, although it is possible to revert the page to any previous version, all changes (even acceptable ones) that were subsequent to the malicious version cannot be automatically inherited by the restored page.</p>
      <p>For instance, consider Versions V1, V2, and V3 of a wiki page, where Version V1 contains a baseline (i.e., acceptable) content, and Version V2 is identified as a partial vandalism and is agreed to be removed, but Version V3 contains (possibly, in a completely different section than the target of the malicious attack) relevant and useful content that was added before the vandalistic Version V2 was declared as such. The task of removing the modifications of Version V2 while maintaining (whatever is possible of) Version V3 is a difficult, error‐prone, and time‐consuming task if done manually, yet there is no tool we are aware of that automatically filters contributions from multiple versions and merges them into a new one (or, equivalently, removes only selected intermediate versions).</p>
      <p>However, it is possible to theoretically characterize the interdependencies between subsequent changes to a document. In fact, literature has existed for a long time on exactly these themes (e.g., Durand,
        <link href="#bib14">1994</link>,
        <link href="#bib15">2008</link>). Although a detailed discussion of abstract models of interconnected changes is out of scope for this article (Details and authoritative references can be found in the aforementioned works.), what is relevant in this discussion is that they happen to assume a hierarchical form that is frequently at odds with the hierarchical structure of the content of the document, and as such, most issues derive from the data structures in which content is stored and from the model for manipulating these structures. For instance, the fact that in the wiki perspective each version is an independent unit that shares no content (even unchanged content) with the other versions prevents considering multiple versions as overlapping structures coexisting on the same document. If we were able to make these hierarchies explicit, we would be able to create models and tools to manipulate these documents in a more powerful way and to exploit the existing interconnections between the overlapping hierarchies.</p>
    </section>
    <section xml:id="sec1-5">
      <title type="main">Introduction to EARMARK and Its Support for Overlapping Features</title>
      <p>The presence of hidden overlapping structures—transparent to users, but very difficult to handle by applications—is the common denominator for the scenarios described in the previous section. More than the overlap itself, which cannot be ignored because it does exist and carries important meanings, the problem we face lies in the way applications store such overlapping structures. In the XML world, in fact, the only way to do so is through the use of (complex) workarounds that force the multiple hierarchies into one hierarchy of an XML document. That makes it very tricky to perform sophisticated analysis and searches.</p>
      <p>This section discusses a different approach to metamarkup, EARMARK (Di Iorio, Peroni, &amp; Vitali,
        <link href="#bib10">2009</link>; Di Iorio et al.,
        <link href="#bib11">2010</link>; Peroni &amp; Vitali,
        <link href="#bib27">2009</link>) based on ontologies and Semantic Web technologies. The basic idea is to model EARMARK documents as collections of addressable text fragments, and to associate such text content with OWL assertions that describe structural features as well as semantic properties of (parts of) that content. As a result, EARMARK allows not only documents with single hierarchies (as with XML) but also multiple overlapping hierarchies where the textual content within the markup items belongs to some hierarchies, but not to others. Moreover, EARMARK makes it possible to add semantic annotations to the content through assertions that may overlap with existing ones.</p>
      <p>One of the advantages of using EARMARK is the capability to access and query documents by using well‐known and widely supported tools for the Semantic Web. In fact, EARMARK assertions are simply RDF assertions while EARMARK documents are modeled through OWL ontologies. The consequence is that query languages (e.g., SPARQL; Garlik &amp; Seaborne,
        <link href="#bib16">2010</link>) and actual existing tools such as Jena
        <link href="#note7">7</link>
        <note xml:id="note7">
          <p>
            <url href="http://jena.sourceforge.net">http://jena.sourceforge.net</url>
          </p>
        </note>
        and Pellet
        <link href="#note8">8</link>
        <note xml:id="note8">
          <p>
            <url href="http://pellet.owldl.com">http://pellet.owldl.com</url>
          </p>
        </note>
        can be directly used to deal with even incredibly complicated overlapping structures. What is very difficult (or impossible) to do with traditional XML technologies becomes much easier with these technologies under the EARMARK approach.</p>
      <p>In the rest of this section, we give a brief overview of the EARMARK model and then describe how EARMARK can be used to deal with the issues presented earlier. The model itself is defined through an OWL document,
        <link href="#note9">9</link>
        <note xml:id="note9">
          <p>
            <url href="http://www.essepuntato.it/2008/12/earmark">http://www.essepuntato.it/2008/12/earmark</url>
          </p>
        </note>
        summarized in Figure
        <link href="#fig1">1</link>, specifying classes and relationships. We distinguish between <i>ghost classes</i>, which define the general model, and <i>shell classes</i>, which are actually used to create EARMARK instances.</p>
      <figure xml:id="fig1">
        <label>1</label>
        <mediaResourceGroup>
          <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig001" />
          <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
          <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
          <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
        </mediaResourceGroup>
        <caption>
          <p>A UML‐like representation of the EARMARK ontology. [Color figure can be viewed in the online issue, which is available at
            <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
        </caption>
      </figure>
      <section xml:id="sec2-3">
        <title type="main">Ghost Classes</title>
        <p>The ghost classes describe three disjoint base concepts—<i>docuverses, ranges</i>, and <i>markup items</i>—through three different and disjoint OWL classes.
          <link href="#note10">10</link>
          <note xml:id="note10">
            <p>All our OWL samples are presented using the Manchester Syntax (Horridge &amp; Patel‐Schneider,
              <link href="#bib19">2009</link>), which is one of the standard linearization syntaxes of OWL. The prefixes <i>rdfs</i> and <i>xsd</i> refer to RDF Schema and XML Schema namespaces, respectively, while the empty prefix refers to the EARMARK ontology URI plus “#.” Moreover, we use the prefix <i>c</i> to indicate entities taken from an imported ontology made for the SWAN project (Ciccarese et al.,
              <link href="#bib7">2008</link>); available at
              <url href="http://swan.mindinformatics.org/spec/1.2/collections.html">http://swan.mindinformatics.org/spec/1.2/collections.html</url>
            </p>
          </note>
        </p>
        <p>The textual content of an EARMARK document is conceptually separated from its annotations, and is referred to through the <i>Docuverse</i> class.
          <link href="#note11">11</link>
          <note xml:id="note11">
            <p>This class (and its name) is based on the concept introduced by Ted Nelson (
              <link href="#bib26">1980</link>) in his Xanadu Project to refer to the collection of text fragments that can be interconnected to each other and transcluded into new documents.</p>
          </note>
          The individuals of this class represent the object of discourse (i.e., all the containers of text of an EARMARK document).</p>
        <p>
          <displayedItem xml:id="di-ueqn-7" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq007" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>Any individual of the <i>Docuverse</i> class—commonly called a <i>docuverse</i> (lowercase to distinguish it from the class)—specifies its actual content with the property <i>hasContent</i>.</p>
        <p>We then define the class <i>Range</i> for any text lying between two locations of a docuverse. A <i>range</i> (i.e., an individual of the class <i>Range</i>) is defined by a starting and an ending location (any literal) of a specific docuverse through the properties <i>begins, ends</i>, and <i>refersTo</i>, respectively.</p>
        <p>
          <displayedItem xml:id="di-ueqn-8" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq008" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>There is no restriction on locations used for the <i>begins</i> and <i>ends</i> properties. That is very useful because it allows us to define ranges that <i>follow</i> or <i>reverse</i> the text order of the docuverse to which they refer. For instance, the string “desserts” can be considered both in document order, with the <i>begins</i> location lower than the <i>ends</i> location, or in the opposite order, forming “stressed.”
          <link href="#note12">12</link>
          <note xml:id="note12">
            <p>
              <url href="http://en.wikipedia.org/wiki/Palindrome#Semordnilaps">http://en.wikipedia.org/wiki/Palindrome#Semordnilaps</url>
            </p>
          </note>
          Thus, the values of the properties <i>begins</i> and <i>ends</i> define the way a range must be read.</p>
        <p>The class <i>MarkupItem</i> is the superclass defining artifacts to be interpreted as markup (e.g., elements and attributes).</p>
        <p>
          <displayedItem xml:id="di-ueqn-9" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq009" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>A <i>markupitem</i> individual is a collection (c:Set, c:Bag, or c:List, where the latter is a subclass of the second one, and all of them are subclasses of c:Collection) of individuals belonging to the classes <i>MarkupItem</i> and <i>Range</i>. Through these collections, it is possible to define a markup item as a set, a bag, or a list of other markup items, using the properties <i>element</i> (for sets) and <i>item</i> and <i>itemContent</i> (for bags and lists). Thus, it becomes possible to define elements containing nested elements or text, or attributes containing values, as well as overlapping and complex structures. Note also that handling collections directly in OWL allows us to reason about content models for markup items, which would not be possible if we had used the corresponding constructs in RDF.
          <link href="#note13">13</link>
          <note xml:id="note13">
            <p>
              <url href="http://hcklab.blogspot.com/2008/12/moving-towards-swan-collections.html">http://hcklab.blogspot.com/2008/12/moving-towards-swan-collections.html</url>
            </p>
          </note>
        </p>
        <p>A <i>markupitem</i> also might have a name, specified in the functional property <i>hasGeneralIdentifier</i> (recalling the SGML term to refer to the name of elements; Goldfarb,
          <link href="#bib18">1990</link>), and a namespace, specified using the functional property <i>hasNamespace</i>. Note that we can have <i>anonymous</i> markup items—as is possible in LMNL (Tennison &amp; Piez,
          <link href="#bib37">2002</link>) and in GODDAG (Sperberg‐McQueen &amp; Huitfeldt,
          <link href="#bib35">2004</link>)—by simply asserting that the item belongs to the class of all those markupitems that do not have a general identifier (i.e., hasGeneralIdentifier exactly 0).</p>
      </section>
      <section xml:id="sec2-4">
        <title type="main">Shell Classes</title>
        <p>The ghost classes discussed so far give us an abstract picture of the EARMARK framework. We need to specialize our model, defining a concrete description of our classes. These new <i>shell</i> subclasses apply specific restrictions to the ghost classes.</p>
        <p>First, the class <i>Docuverse</i> is restricted to be either a <i>StringDocuverse</i> (i.e., the content is specified by a string) or a <i>URIDocuverse</i> (i.e., the actual content is located at the URI specified).</p>
        <p>
          <displayedItem xml:id="di-ueqn-10" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq010" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>Depending on particular scenarios or on the kind of docuverse we are dealing with (plain‐text, XML, LaTeX, a picture, etc.), we need to use different kinds of ranges. Therefore, the class <i>Range</i> has three different subclasses:
          <list xml:id="l3" style="bulleted">
            <listItem>
              <p><i>PointerRange</i> defines a range by counting characters. In that case, the value of the properties <i>begins</i> and <i>ends</i> must be a nonnegative integer that identifies unambiguous positions in the character stream, remembering that the value <i>0</i> refers to the location immediately before the first character, the value <i>1</i> refers to the location after the first character and before the second one, and so on. By using the <i>hasKey</i> OWL property, we also assert that two pointer ranges having equal docuverse and begin and end locations are the same range.</p>
            </listItem>
            <listItem>
              <p><i>XPathRange</i> defines a range considering the whole docuverse or its particular context specifiable through an XPath expression (Berglund et al.,
                <link href="#bib5">2007</link>) as value of the property <i>hasXPathContext</i>. Note that by using these ranges, we implicitly admit that the docuverse it refers to must be an XML structure. Moreover, the properties <i>begins</i> and <i>ends</i> have to be applied on the string value obtained by juxtaposing all the text nodes identified by the XPath. By using the <i>hasKey</i> OWL property, we also assert that two xpath ranges having equal docuverse, XPath context, and begin and end locations are the same range.</p>
            </listItem>
            <listItem>
              <p><i>XPathPointerRange</i> is an XPathRange in which the value of the properties <i>begins</i> and <i>ends</i> must be a nonnegative integer that identifies unambiguous positions in the character stream as described for the class <i>PointerRange</i>.</p>
            </listItem>
          </list>
        </p>
        <p>
          <displayedItem xml:id="di-ueqn-11" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq011" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p><i>MarkupItem</i> is specialized in three disjoint subclasses—<i>Element, Attribute</i>, and <i>Comment</i>—that allow a more precise characterization of markup items.</p>
        <p>
          <displayedItem xml:id="di-ueqn-12" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq012" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
      </section>
      <section xml:id="sec2-5">
        <title type="main">Range and Markup Item Overlap</title>
        <p>The presence of overlap in EARMARK is worth discussing in more detail. Different types of overlap exist, according to the subset of items involved, and different strategies are needed to detect them. In particular, there is a clear distinction between <i>overlapping ranges</i> and <i>overlapping markup items</i>.</p>
        <p>By definition, <i>overlapping ranges</i> are two ranges that refer to the same docuverse, so that at least one of the locations of the first range is contained in the interval described by the locations of the second range (excluding its terminal points). <i>Totally overlapping ranges</i> have the locations of the first range completely contained in the interval of the second range, or vice versa, while <i>partially overlapping ranges</i> have either exactly one location inside the interval and the other outside or identical terminal points in reversed roles.</p>
        <p>Thus, if we consider the following excerpt:</p>
        <p>
          <displayedItem xml:id="di-ueqn-13" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq013" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>we can infer, through a reasoner such as Pellet, that these two ranges overlap by using the following rules:</p>
        <p>
          <displayedItem xml:id="di-ueqn-14" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq014" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>where <i>P</i> is one of:
          <list xml:id="l4" style="bulleted">
            <listItem>
              <p>lessThan(b1,e1) ∧ greaterThan(b2,b1) ∧ lessThan(b2,e1)</p>
            </listItem>
            <listItem>
              <p>lessThan(b1,e1) ∧ greaterThan(e2,b1) ∧ lessThan(e2,e1)</p>
            </listItem>
            <listItem>
              <p>lessThan(e1,b1) ∧ greaterThan(b2,e1) ∧ lessThan(b2,b1)</p>
            </listItem>
            <listItem>
              <p>lessThan(e1,b1) ∧ greaterThan(e2,e1) ∧ lessThan(e2,b1).</p>
            </listItem>
          </list>
        </p>
        <p>The case of <i>overlapping markup items</i> is slightly more complicated. We define that two markup items <i>A</i> and <i>B</i> <i>overlap</i> when at least one of the following sentences holds:
          <list xml:id="l5" style="bulleted">
            <listItem>
              <p>[<b>Overlap by range</b>]: <i>A</i> contains a range that overlaps with another range contained by <i>B</i>.</p>
            </listItem>
            <listItem>
              <p>[<b>Overlap by content hierarchy</b>]: <i>A</i> and <i>B</i> contain at least a range in common.</p>
            </listItem>
            <listItem>
              <p>[<b>Overlap by markup hierarchy</b>]: <i>A</i> and <i>B</i> contain at least a markup item in common.</p>
            </listItem>
          </list>
        </p>
        <p>The three possible scenarios for such item overlap are summarized in Figure
          <link href="#fig2">2</link>.
          <link href="#note14">14</link>
          <note xml:id="note14">
            <p>The EARMARK documents describing these three overlapping scenarios and all the other ones presented in the following sections are available at
              <url href="http://www.essepuntato.it/2011/jasist/examples">http://www.essepuntato.it/2011/jasist/examples</url>
            </p>
          </note>
        </p>
        <figure xml:id="fig2">
          <label>2</label>
          <mediaResourceGroup>
            <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig002" />
            <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
            <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
            <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
          </mediaResourceGroup>
          <caption>
            <p>Three EARMARK examples of overlapping between elements <i>p</i>. [Color figure can be viewed in the online issue, which is available at
              <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
          </caption>
        </figure>
        <p>The EARMARK ontology, in fact, is completed by another ontology
          <link href="#note15">15</link>
          <note xml:id="note15">
            <p>
              <url href="http://www.essepuntato.it/2011/05/overlapping">http://www.essepuntato.it/2011/05/overlapping</url>
            </p>
          </note>
          that models all overlapping scenarios, either for ranges or markup items, and includes rules for automatically inferring overlaps through a reasoner.</p>
      </section>
      <section xml:id="sec2-6">
        <title type="main">EARMARK as a Standoff Notation</title>
        <p>If we ignore for a moment the semantic implications of using EARMARK and concentrate on its syntactical aspects only, it is easy to observe that EARMARK is nothing but yet another standoff notation, where the markup specifications point to, rather than contain, the relevant substructure and text fragments.</p>
        <p>Standoff notations, also known in literature as out‐of‐line notations (TEI Consortium,
          <link href="#bib36">2005</link>), are hardly new, but never really caught on for a number of reasons, most having to do with their perceived fragility under the circumstances of desynchronized modification to the text. In Georg, Schonefeld, Trippel, and Witt (
          <link href="#bib17">2010</link>) and Bański (
          <link href="#bib3">2010</link>), we can find a pair of recent and substantially complete analyses of their merits and demerits. In particular, according to Georg, Schonefeld, Trippel, and Witt (
          <link href="#bib17">2010</link>), “standoff annotation has … quite a few disadvantages:
          <list xml:id="l6" style="bulleted">
            <listItem>
              <p>very difficult to read for humans</p>
            </listItem>
            <listItem>
              <p>The information, although included, is difficult to access using generic methods.</p>
            </listItem>
            <listItem>
              <p>Limited software support as standard parsing or editing software cannot be employed.</p>
            </listItem>
            <listItem>
              <p>Standard document grammars can be used only for the level which contains both markup and textual data.</p>
            </listItem>
            <listItem>
              <p>New layers require a separate interpretation.</p>
            </listItem>
            <listItem>
              <p>Layers, although separate, often depend on each other.”
                <link href="#note16">16</link>
                <note xml:id="note16">
                  <p>To individually address the issues, we edited the original bullets into a numbered list.</p>
                </note>
              </p>
            </listItem>
          </list>
        </p>
        <p>And yet, although EARMARK <i>is</i> in practice a standoff notation, it provides a number of workarounds to most of the aforementioned issues.</p>
        <p>First, since EARMARK is based on OWL and can be linearized in any of the large number of OWL linearization syntaxes, it follows that (a) readability, (b) access, and (c) software support for it are exactly those existing for well‐known, widespread, and important W3C standards such as RDF and OWL. Being able to employ common RDF and OWL tools such as Jena and SPARQL for EARMARK documents was in fact a major motivation for it.</p>
        <p>Issue 4 should be examined beyond the mere validation against document grammars and toward a general evaluation of the level of compliancy of the markup to some formally specified expectations. EARMARK documents, while being subject to no document grammar in the stricter XML sense, allow the specification of any number of constraints, expressed either directly in OWL or SWRL (Horrocks et al.,
          <link href="#bib20">2004</link>), or even in SPARQL, that trigger or generate validity evaluations. In Di Iorio, Peroni, and Vitali (in press), we tried to show that a large number of requirements, from hierarchical well‐formedness in the XML sense, to validation requirements in terms of XML DTDs, to adherence to design patterns, can be expressed satisfactorily using these technologies.</p>
        <p>Issue 5 regards the difficulty of standoff notations to provide interlayer analysis on XML structures: Separate interpretation of markup layers is easy, but identification and validation of overlapping situations are more complex: Standoff markup is mainly composed of pointers to content and does not have any direct way to determine overlap locations without some kind of pointer arithmetics to compute them. Validation of contexts allowing overlaps as describable using rabbit/duck grammars (Sperberg‐McQueen,
          <link href="#bib34">2006</link>) also is not trivial. In this regard, EARMARK provides yet again a solution that does not require special tools: Although OWL does not allow direct pointer arithmetics, SWRL on the contrary does, as shown earlier where we described a batch of (SWRL‐implementable) rules that do, in fact, determine overlapping locations on EARMARK documents with good efficiency.</p>
        <p>Finally, Issue 6 refers to the fact that evolution of separate markup annotation layers needs to synchronously take place, lest one of them becomes misaligned with the new state of the document. This is, in summary, the <i>fragility of pointers</i>, which can be considered the fundamental weakness of standoff, as well as of any notation that has markup separate from its content: If a modification occurs to the underlying (probably text‐based) source, all standoff pointers that could not be updated at the same time of the change become outdated and possibly wrong. All standoff notations fall prey to this weakness, and there is no way to completely get rid of it.</p>
        <p>What is possible is to identify exactly the conditions under which such weakness acts, and see if there is a way to reduce the mere frequency of such events. In fact, for a standoff pointer to become outdated, several conditions must take place at the same time:
          <list xml:id="l7" style="bulleted">
            <listItem>
              <p>The standoff notation must be used as a storage format, rather than just as a processing format;</p>
            </listItem>
            <listItem>
              <p>the source document must make sense even without the additional standoff markup (i.e., the standoff notation contains no information that is necessary for at least some types of document modifications);</p>
            </listItem>
            <listItem>
              <p>the source document must be editable (and, in fact, must be edited) on its own;</p>
            </listItem>
            <listItem>
              <p>the standoff pointers must rely on positions that change when the source is edited (e.g., character‐based locations);</p>
            </listItem>
            <listItem>
              <p>editing must be done in contexts and with tools that cannot or do not update the standoff pointers; and</p>
            </listItem>
            <listItem>
              <p>there must be no computable way to determine the modifications of the document (e.g., via a <i>diff</i> between the old and new versions).</p>
            </listItem>
          </list>
        </p>
        <p>Of course, no standoff notation can rule out that these conditions occur on their documents, but note that <i>all six</i> of them must occur for standoff pointers to become outdated. EARMARK is not safe from these occurrences either, but at least for the use cases here described, one or more of these conditions simply do not apply: EARMARK is mostly used as a processing format, with no need to save it on disk (Conversion from the source formats such as MS Word is described later and does not require special storage.), the data format described is either in a very specific format (e.g., MS Word or ODT) that in fact already does handle internally its data changes and requires the overlapping data exactly for this purpose, or is in fact the result of a <i>diff</i> action on successive versions of a document (as in the case of the wiki pages). Finally, EARMARK allows references to relatively stable fragment ids of the documents (by using XPath ranges without specifying explicitly begin and end locations) rather than the extremely fragile character locations, further reducing the chances of outdated pointers. For this reason, without being able to completely rule out the possibility of standoff pointers going wrong, we tend to consider it as a significantly small risk, at least for the use case described here.</p>
      </section>
      <section xml:id="sec2-7">
        <title type="main">Using OWL Versus RDF for Standoff Notations</title>
        <p>EARMARK is strongly based on OWL 2 DL (W3C OWL Working Group,
          <link href="#bib39">2009</link>) to express multiple markup layers with possible overlapping ranges over the same content. OWL 2 DL is not the only possible choice for expressing standoff notations via Semantic Web technologies. In fact, RDF is another valid and effective model for dealing with the same issue, as shown in Tummarello et al. (
          <link href="#bib38">2005</link>), by means of the open‐source application programming interface (API) <i>RDF Textual Encoding Framework</i> (<i>RDFTef</i>). This API was created to demonstrate a plausible way for handling overlapping markup within documents and identifying textual content of a document as a set of independent RDF resources that can be linked mutually and with other parent resources.</p>
        <p>Besides giving the possibility to define multiple structural markup hierarchies over the same text content, the use of RDF as the language for encoding markup makes it possible to specify semantic data on textual content as well. But the real main advantage in using RDF is the possibility of using particular built‐in resources specifically defined in the RDF syntax specification (Beckett,
          <link href="#bib4">2004</link>) for describing and dealing with different kinds of containers, either ordered (rdf:Seq) or unordered (rdf:Bag). Thus, RDF resources can be used to represent every printable element in the text—words, punctuation, characters, typographical symbols, and so on—while RDF containers also can be used to combine such fragments and containers.</p>
        <p>RDF alone, however, is not sufficient to define a formal vocabulary for structural markup: Does a given resource represent an element, an attribute, a comment, or a text node? In which way is a resource of a certain type related to others? The specification of an RDFS (Brickley &amp; Guha,
          <link href="#bib6">2004</link>) or of an OWL layer can successfully address these issues. Hybrid solutions obtained by mixing different models, even when they are built one upon another, may seem elegant, but are not necessarily the best choice. In fact, there exist well‐known interoperability limits between OWL 2 DL and RDF that prevent the correct use of Semantic Web tools and technologies. In particular:
          <list xml:id="l8" style="bulleted">
            <listItem>
              <p>Any markup document made using RDF containers (e.g., to describe what markup items contain and in which order) and OWL ontologies (e.g., to define classes of markup entities and their semantics) results in a set of axioms that end up outside of OWL DL and well within OWL Full, which limits the applicability of the most frequently used Semantic Web tools that are usually built upon the (computationally tractable) description logic underlying OWL 2 DL.</p>
            </listItem>
            <listItem>
              <p>The individual analysis of each language may not be applicable when we have to check particular properties that lie between RDF and OWL layers. For example, verifying the validity of a markup document against a particular schema, which is one of the most common activities with markup, needs to be made to work with both markup item structures (that would be defined in RDF) and logical constraints about classes of markup items (e.g., elements only, attributes only, the element “p,” all the elements of a particular namespace, etc., all of them definable in OWL).</p>
            </listItem>
          </list>
        </p>
        <p>Being able to express everything we need directly in OWL quite straightforwardly addresses both issues. The well‐known absence of containers and sequences in OWL can be overcome by modeling classes in specific ways using specific design patterns such as those in Ciccarese et al. (
          <link href="#bib7">2008</link>) and in Drummond et al. (
          <link href="#bib13">2006</link>).</p>
      </section>
    </section>
    <section xml:id="sec1-6">
      <title type="main">Using EARMARK</title>
      <p>There are multiple applications for the EARMARK approach. The most interesting for this article is its capability of dealing with overlapping structures in an elegant and straightforward manner. Under EARMARK, such structures do not need to be specified through complex workarounds as with XML, but they are explicit and can be easily described and accessed. Sophisticated searches and content manipulations become very simple when using this ontological model.</p>
      <p>The goal of this section is to demonstrate the soundness and applicability of EARMARK by discussing how the use cases presented earlier are addressed. Note that throughout the section we investigate multiple EARMARK data structures and documents, focusing on the feasibility and potentiality of such an ontological representation.</p>
      <section xml:id="sec2-8">
        <title type="main">Looking for Authorial Changes in Office Documents</title>
        <p>The discussion about change tracking in office document formats showed that both ODT (OpenOffice format) and OOXML (Microsoft Word format) use complex data structures to store overlaps generated by change‐tracking functionalities. These structures make it very difficult to search and manipulate the content when using XML languages and tools. Even very simple edits generate a rather tangled set of overlapping elements.</p>
        <p>Let us recall the example mentioned earlier where the user “John Smith” splits a single paragraph into two. The ODT representation is:</p>
        <p>
          <displayedItem xml:id="di-ueqn-15" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq015" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The OOXML representation (shown earlier) is even more complex. In fact, these formats rely heavily on (tangled) fragmentation (OOXML) or on milestones and stand‐off markup (ODT) to deal with overlaps.</p>
        <p>EARMARK, on the other hand, stores overlapping data in a direct and streamlined manner that does not require tools to rebuild information from the twists of a tree‐based XML structure. The information already is available and expressed through consistent RDF and OWL statements. Figure
          <link href="#fig3">3</link> graphically shows the corresponding EARMARK document.</p>
        <figure xml:id="fig3">
          <label>3</label>
          <mediaResourceGroup>
            <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig003" />
            <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
            <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
            <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
          </mediaResourceGroup>
          <caption>
            <p>Encoding in EARMARK the ODT change‐tracking example. [Color figure can be viewed in the online issue, which is available at
              <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
          </caption>
        </figure>
        <p>The original paragraph content and the new string “also” are now encoded as two docuverses over which the ranges <i>r1, r2</i>, and <i>r3</i> are defined. The original paragraph is then composed of the (content of) ranges <i>r1</i> and <i>r2</i> while the paragraphs resulting after the (text and carriage return) insertion now comprise range <i>r1</i> and ranges <i>r2</i> and <i>r3</i>, respectively. Metadata about the author and the modification date are encoded as further RDF statements.</p>
        <p>
          <displayedItem xml:id="di-ueqn-16" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq016" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The advantages of streamlining overlaps become apparent if we consider tasks a little beyond the mere display. For instance, the query for “the textual content of all paragraphs inserted by John Smith” ends up rather entangled if we used XPath on the ODT structure. The process for finding that textual content needs to browse the current version of the document and look for all the <i>text:change‐start/text:change‐end</i> pairs that refer to an insertion made by John Smith involving the creation of a new paragraph (i.e., <i>text:change‐start</i> is in a first paragraph while its pair, <i>text:change‐end</i>, is in the following one) that are either currently present in the document body or hidden behind a subsequent deletion made by someone else. Once the paragraphs are identified, we need to retrieve the content that originally was contained there (i.e., the text fragments that still are within those boundaries or that may have been deleted in subsequent versions). The following XPath represents an implementation of this process:</p>
        <p>
          <displayedItem xml:id="di-ueqn-17" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq017" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The XML structure of an MS Word file, using segmentation rather than milestones, does simplify the query a bit, but still presents some radical complexities. The process starts by choosing all those <i>w:p</i> elements that were inserted by John Smith as well as all their previous and contiguous <i>w:p</i> elements that were deleted before or inserted after the first ones. In OOXML, each sequence of contiguous <i>w:p</i> elements implicitly represents one paragraph. Therefore, we can now take all the text fragments contained in each <i>w:p</i> sequence that were inserted before or deleted after the paragraph defined by the sequence itself. The following is the resulting XPath for an OOXML document.</p>
        <p>
          <displayedItem xml:id="di-ueqn-18" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq018" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The complexity of both XPath queries is due to the intrinsic complexity of the data structure on which the query has to work. Although the interface of OpenOffice or MS Word may provide tools to directly deal with these queries using specific strategies on the internal data structures, applications working directly on the XML structure have very little help in disentangling the mess of the data formats.</p>
        <p>On the other hand, since EARMARK documents are actually OWL files, it is possible to access and query them with plain Semantic Web tools. Powerful searches then can be performed without using niche‐specific tools or complex and long XPath expressions simply with mainstream technologies such as SPARQL (Garlik &amp; Seaborne,
          <link href="#bib16">2010</link>).</p>
        <p>The corresponding SPARQL query for (“the textual content of all paragraphs inserted by John Smith”) therefore can be written as follows:</p>
        <p>
          <displayedItem xml:id="di-ueqn-19" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq019" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>But EARMARK is useful for even more than querying: EARMARK also decreases the costs, in terms of efforts and lines of code, for manipulating documents.</p>
        <p>Let us consider the task of generating an intermediate version (i.e., neither the first nor the last one of a version chain) from a document that includes change‐tracking information about the whole document history.</p>
        <p>The process of rebuilding these versions by working on the XML structure without specific APIs is both complex and inefficient. For example, a basic XSLT that returns an XML document defining the desired version must at least:
          <list xml:id="l9" style="bulleted">
            <listItem>
              <p>define templates for all the elements actively involved in the change tracking (e.g., for ODT, <i>text:changed‐region, text:change‐start, text:change‐end</i>, and <i>text:change</i> and similarly for OOXML) to understand, by looking at their creation date, whether they must be considered or ignored when building the requested version. In particular, we must exclude insertions following and deletions preceding the version we are building;</p>
            </listItem>
            <listItem>
              <p>define templates for paragraphs to handle cases where the paragraph is the result of an insertion or a deletion of other paragraphs to identify whether it should be considered for the result and, in such case, finding out its real text content and remembering that in the following versions, such content may have spread out among other paragraphs;</p>
            </listItem>
            <listItem>
              <p>define templates for handling insertions/deletions for structures such as images, sections, lists, and tables; and</p>
            </listItem>
            <listItem>
              <p>define an identity template for the other elements to visit the entire document.</p>
            </listItem>
          </list>
        </p>
        <p>Even the most basic and incomplete implementation of such XSLT requires hundreds of lines of complex and convoluted code and a large number of ad hoc decisions based on the specificities of whether we start from ODT or OOXML. Note also that a Java‐based implementation (or in any other procedural language) of the same process would be equally or even more complex.</p>
        <p>The same result can be achieved on EARMARK documents with a few lines of Java code:</p>
        <p>
          <displayedItem xml:id="di-ueqn-20" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq020" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>This approach uses the EARMARK Java API
          <link href="#note17">17</link>
          <note xml:id="note17">
            <p>
              <url href="http://earmark.sourceforge.net">http://earmark.sourceforge.net</url>
            </p>
          </note>
          and a single SPARQL query, runnable on any SPARQL 1.1 processor such as Jena, to identify the root node of the subtree of the version that is associated with the specified date and creator. Then, it performs a simple, recursive, depth‐first visit to clone all the nodes in the tree and to combine them in the output EARMARK document.</p>
        <p>This method heavily uses Semantic Web technologies on the structures provided by EARMARK whose characteristics are always explicit and clear. In fact, since <i>all</i> versions coexist within the EARMARK document and each version can be encoded <i>explicitly</i> as a tree within the overall graph, this operation is straightforward and fast.</p>
      </section>
      <section xml:id="sec2-9">
        <title type="main">Improving Semantic Annotations</title>
        <p>EARMARK also can be exploited to improve semantic annotations. As noted earlier, there are in fact strong limitations in the same process of annotating web documents with semantic structures that overlap the structural ones. The same example—of <i>vcards</i> that cannot be created on the top of tables organized <i>per rows</i>—will be used in this section.</p>
        <p>We solve this by converting the web document with annotations into an EARMARK document, allowing both semantic and structural annotations to coexist. Through EARMARK, we can explicitly express both markup structures and <i>vcard</i> assertions. Figure
          <link href="#fig4">4</link> shows how the <i>vcard</i> example can be modeled. (Once again, we show a graphical representation for the sake of clarity.)</p>
        <figure xml:id="fig4">
          <label>4</label>
          <mediaResourceGroup>
            <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig004" />
            <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
            <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
            <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
          </mediaResourceGroup>
          <caption>
            <p>The abstract model of the EARMARK document solving the microformats issue. [Color figure can be viewed in the online issue, which is available at
              <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
          </caption>
        </figure>
        <p>The textual content of the original table cells is now encoded in two different docuverses: one for the header (with roles) and one for the body (with names of committee members). Ranges <i>r1, r2</i>, …, <i>r8</i> are then created to distinguish each role and name. Two independent and coexisting hierarchies are then built on top of the same set of ranges: the HTML table that includes one cell for each range (in blue) and the <i>vcards</i> about each person (in green) that include only the relevant ranges and overlap the previous one. Note also that the <i>vcards</i> are defined in such a way that does not interfere with the structural features of the table. The full linearization in OWL of this example can be found at
          <url href="http://www.essepuntato.it/2011/jasist/examples">http://www.essepuntato.it/2011/jasist/examples</url>
        </p>
      </section>
      <section xml:id="sec2-10">
        <title type="main">Improving Wiki Content Reversions</title>
        <p>EARMARK can be used to improve wiki reversion mechanisms and overcome the limitations discussed earlier: The automatic filtering and merging of contributions from multiple versions of the same page are still a <i>manual</i> process, but it can be fully automatized if the overlapping structures buried in the <i>whole</i> history of the page become explicit.</p>
        <p>The role of EARMARK is to make those structures explicit and available for more sophisticated content manipulation. To understand the extent to which EARMARK structures can be derived from wikis and how they can be exploited by the final users, we use as our example the wiki platform MediaWiki
          <link href="#note18">18</link>
          <note xml:id="note18">
            <p>
              <url href="http://www.mediawiki.org">http://www.mediawiki.org</url>
            </p>
          </note>
          (i.e., the wiki engine of Wikipedia).</p>
        <p>MediaWiki offers sophisticated functionalities for creating <i>diffs</i> of wiki content. Users can compare any two revisions in the page history and highlight changes in a friendly interface that shows modifications with a word‐level granularity. <i>Diff</i> pages contain metadata about each compared version (when the version was created, who the author was, or which IP address an anonymous author was connected from, etc.) and a two‐column table showing the changes side by side. Changes are detected a posteriori by comparing two arbitrary versions, which are not even required to be temporally contiguous.</p>
        <p>The output of the MediaWiki <i>diff</i> engine has regularities that can be exploited to automatically build the overlapping structures of the <i>diff</i> and to express them in EARMARK. Let us consider a fictitious example summarized in Table
          <link href="#tbl1">1</link>, where an initial text is revised three times by different authors.</p>
        <tabular xml:id="tbl1" pRights="unknown" eRights="yes" copyright="John Wiley &amp; Sons, Ltd.">
          <label>1</label>
          <title type="main">All the versions of a wiki page modified by different authors.</title>
          <table frame="topbot" colsep="0" rowsep="0" pgwide="1">
            <tgroup cols="5">
              <colspec colnum="1" colname="col1" colwidth="1*" align="left" />
              <colspec colnum="2" colname="col2" colwidth="1*" align="left" />
              <colspec colnum="3" colname="col3" colwidth="1*" align="left" />
              <colspec colnum="4" colname="col4" colwidth="1*" align="left" />
              <colspec colnum="5" colname="col5" colwidth="1*" align="left" />
              <thead valign="top">
                <row rowsep="1">
                  <entry align="left">Version Author</entry>
                  <entry align="center">V1 151.61.3.122</entry>
                  <entry align="center">V2 Angelo Di Iorio</entry>
                  <entry align="center">V3 Silvio Peroni</entry>
                  <entry align="center">V4 Fabio Vitali</entry>
                </row>
              </thead>
              <tbody>
                <row>
                  <entry>Content</entry>
                  <entry>Bob was farming carrots and tomatoes.</entry>
                  <entry>Bob was farming carrots, tomatoes <i>and beans</i>.</entry>
                  <entry>Bob was farming carrots, tomatoes and <i>green</i> beans. <i>They were all tasteful.</i></entry>
                  <entry>Bob was farming carrots, tomatoes and green beans. [<i>new paragraph</i>] They were all tasteful.</entry>
                </row>
              </tbody>
            </tgroup>
          </table>
        </tabular>
        <p>To display the differences between V1 and V2, MediaWiki creates a page whose HTML code is as follows:
          <link href="#note19">19</link>
          <note xml:id="note19">
            <p>For the sake of clarity, we removed all markup irrelevant to our discussion.</p>
          </note>
        </p>
        <p>
          <displayedItem xml:id="di-ueqn-21" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq021" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>This is an HTML table of two rows, the first showing metadata (date and author of the modification), and the second showing the actual modifications. The first cell of the second row contains all the unmodified text and a <i>del</i> element for each inline fragment that was deleted. The second cell contains all the unmodified text and an <i>ins</i> element for each inline fragment that was inserted. Thus, these cells share <i>exactly</i> the same unmodified part(s) of the two compared versions.</p>
        <p>When the structure itself is modified rather than merely the text, the source code of the MediaWiki <i>diff</i> is slightly different. Thus, the <i>diff</i> between V3 and V4 (which splits a paragraph in two) is as follows:</p>
        <p>
          <displayedItem xml:id="di-ueqn-22" type="mathematics" numbered="no">
            <mediaResourceGroup>
              <mediaResource alt="equation image" href="urn:x-wiley:15322882:media:ASI21591:ueq023" />
              <mediaResource alt="equation image" rendition="webOriginal" mimeType="image/gif" href="" />
            </mediaResourceGroup>
          </displayedItem>
        </p>
        <p>The <i>diff</i> output is not complete or sophisticated, and of course, it is a completely different task to replan such an algorithm (but for a first idea of natural changes in <i>diffing</i> XML documents, see Di Iorio, Marchetti, Schirinzi, &amp; Vitali,
          <link href="#bib10">2009</link>). Thus, limitations of that algorithm are inevitably shared by any EARMARK representation. Yet, this output is sufficiently rich to allow us to extract the overlapping information we need. For instance, the insertion of a nonbreaking space or a carriage return generates rows according to specific rules that can be easily detected to capture the actual change by the author.</p>
        <p>Figure
          <link href="#fig5">5</link> shows the aforementioned example rebuilt in EARMARK. All versions are encoded in the same document by creating overlapping assertions over the docuverses. Metadata and RDF statements are layered on top of those assertions and create a rich knowledge‐base about the history of the documents and, in particular, about the history of each fragment.</p>
        <figure xml:id="fig5">
          <label>5</label>
          <mediaResourceGroup>
            <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig005" />
            <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
            <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
            <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
          </mediaResourceGroup>
          <caption>
            <p>The wiki sample versions encoded in a single EARMARK document. [Color figure can be viewed in the online issue, which is available at
              <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
          </caption>
        </figure>
        <p>Due to the complexity of the example, we labeled arrows with numbers that indicate the position of each range within each markup item. Consider, for instance, Version <i>V4</i>: It is composed of two DIV elements, the first one containing the concatenation of “Bob was farming carrots” + “,” + “tomatoes” + “and” + “green” + “beans” + “.” and the second one containing the string “They were all tasteful.”</p>
        <p>Implementing a wiki content‐filtering mechanism on top of such a structure is rather simple. For instance, the removal of all the contributions of “Angelo Di Iorio” that leaves untouched all the content written (previously and subsequently) by “Silvio Peroni” and “Fabio Vitali” can be performed straightforwardly. Three steps are enough to apply such an intermediate content reversion:
          <list xml:id="l10" style="bulleted">
            <listItem>
              <p>the identification of the fragments written by “Angelo Di Iorio,” which is a straightforward SPARQL query on the embedded statements;</p>
            </listItem>
            <listItem>
              <p>the creation of a new version where references to those fragments are removed and references to fragments no longer in the document are correctly fixed;</p>
            </listItem>
            <listItem>
              <p>the translation of that document into an actual MediaWiki page through the serialization process described in Peroni and Vitali (
                <link href="#bib27">2009</link>).</p>
            </listItem>
          </list>
        </p>
        <p>Of course, an automatic process may generate ambiguities or even errors in the resulting content (i.e., some parts may become dangling, wrong, or unclear after removing text fragments elsewhere); grammar discrepancies also might be generated by the same approach. Linguistic and semantic problems, however, become a concern only once the technical issues of managing independent, yet successive, edits are solved. What is important is that all the information about overlaps and dependencies among fragments is available in EARMARK and can easily be searched, filtered, and manipulated. Besides, foreseeing a manual intervention for checking and polishing automatically filtered content is perfectly in line with the wiki philosophy, so that the wiki community itself can wisely use the reversion tools to revise the content and adjust any intervening minor nuisances or imperfections. Such checks would still be far simpler and faster than would the manual process of partially reverting versions as we have today.</p>
      </section>
    </section>
    <section xml:id="sec1-7">
      <title type="main">Generating EARMARK From Existing Documents: The ROCCO Approach</title>
      <p>Since we do not expect documents to be natively written in EARMARK or manually created by users, we need a way to extract EARMARK data structures from existing XML‐based resources, which is trivial when the XML is simple and clearly hierarchical and slightly more complex when the XML contains workarounds to force an intrinsically overlapping situation into a single hierarchy.</p>
      <p>We designed a reliable process to transform XML files into EARMARK documents that fully captures overlapping structures even when the overlaps are hidden in one of the many well‐known workarounds. This approach takes as input an XML file and produces the corresponding EARMARK document in five steps: <i>Read, Overhaul, Convert, Classify</i>, and <i>Organize</i> (hence, the name ROCCO).</p>
      <p>Since ROCCO is not the main topic of this article, we very briefly discuss the issue of converting XML into EARMARK, explaining how each step works. The ROCCO algorithm performs five steps, described next.</p>
      <section xml:id="sec2-11">
        <title type="main">Read and Overhaul</title>
        <p>The first two steps consist of loading the XML source file and, if needed, adding information useful for further processing. In EARMARK, there is a clear distinction between the textual content of a document and the structures built on top of it: The content is stored as plain text—within docuverses—and all structures are externalized and expressed through OWL and RDF assertions.</p>
        <p>While OpenOffice stores all overlapping structures in the main document file, some other editors (e.g., MS Word) store overlaps in many different ways, even in a separate file. The <i>overhaul</i> step extracts such data and adds them to the main content document by exploiting format‐specific procedures, implemented via XSLT in most cases.</p>
      </section>
      <section xml:id="sec2-12">
        <title type="main">Convert</title>
        <p>The subsequent step consists of converting the XML source file into an early EARMARK document that expresses <i>exactly</i> the same information and hierarchies. No interpretation or disentanglement of workarounds is performed at this step.</p>
        <p>Since the input is XML, this translation can be performed directly via a generic XSLT stylesheet. It basically consists of a recursive algorithm that parses the source file and generates the corresponding instances in the EARMARK ontology. Such a translation is straightforward and not difficult.</p>
      </section>
      <section xml:id="sec2-13">
        <title type="main">Classify</title>
        <p>The “Classify” step extends the EARMARK document built so far with information about the workarounds used to encode overlaps. That information will be exploited in the subsequent steps to make those overlaps explicit.</p>
        <p>The basic idea is to exploit OWL reasoners to detect workarounds in an early EARMARK document <i>D</i> by:
          <list xml:id="l11" style="bulleted">
            <listItem>
              <p>defining an ontology <i>O</i> that models all the workarounds used by applications, such as milestones, stand‐off markup, etc.; these workarounds are specific to the data format used in the source document;</p>
            </listItem>
            <listItem>
              <p>specifying the EARMARK document <i>D</i> as an ABox for the ontology <i>O</i>;</p>
            </listItem>
            <listItem>
              <p>defining SWRL rules that capture the role of each element in <i>D</i> and check relationships between elements;</p>
            </listItem>
            <listItem>
              <p>running an OWL reasoner, such as Pellet, on <i>D</i>+<i>O</i> to create new OWL instances and properties that identify which workarounds are present.</p>
            </listItem>
          </list>
        </p>
        <p>The actual detection of workarounds is delegated to an external reasoner. Refining detection strategies and even adding new strategies for new formats all can be done via OWL and SPARQL. Indeed, tricky issues need to be addressed—mostly depending on the idiosyncrasies of the original formats—but no procedural code is required.</p>
      </section>
      <section xml:id="sec2-14">
        <title type="main">Organize</title>
        <p>The final step consists of building yet another EARMARK document that expresses the overlaps and metadata <i>in an explicit way</i>, based on the information collected by the previous steps. This phase consists of mapping operations from the native format into the EARMARK structure. Such conversion relies on the identification of metadata to classify the operations and to externalize relevant metadata in separate RDF statements.</p>
      </section>
    </section>
    <section xml:id="sec1-8">
      <title type="main">Evaluating EARMARK</title>
      <p>One of the most frequent criticisms when proposing a different approach to solving a well‐known problem in information and communication technology is that the new solution may simplify the difficulties of the specific problem, but brings with it hidden costs in terms of size of the data structure, computation efforts, or conversion restrictions that compensate the advantages. In our case, one of the anonymous reviewers of our article (Di Iorio, Peroni, &amp; Vitali,
        <link href="#bib10">2009</link>) wondered whether a difference in file size could weigh in on the convenience of adopting EARMARK as opposed to working with the original files.</p>
      <p>As such, a discussion of cost functions of EARMARK versus other formats is in order. Yet, a systematic discussion of the relative costs (e.g., in byte size) of some original XML‐based data structures versus their EARMARK equivalent is an open‐ended undertaking that heavily depends on the original XML data structure and the specific features present in the document, and is badly defined anyway: While XML is a linearization format immediately expressible in actual bytes, OWL (or more precisely, RDF, the language in which OWL ontologies are expressed) is an abstract structure that allows a large number of linearization formats (including XML itself) with corresponding huge differences in the final byte counts.</p>
      <p>For these reasons, to provide at least an initial test of meaningful concepts, we selected two XML‐based data formats (OOXML and ODT) and, specifically, a set of documents where overlapping tricks were present (i.e., where change‐tracking was active). To bypass the size discussion, we decided not to test byte lengths (which are not meaningful and easily skewed, e.g., by reducing the string length of the element names or of the class names) but the number of nodes for XML documents and of triples for OWL documents. This comparison again is not particularly appropriate (Triples are naturally numerous in OWL ontologies, and it is customary to deal with hundreds of thousands and even millions of assertions in Semantic Web applications.) but closer to meaningfulness than is the mere byte count.</p>
      <p>Our comparison was carried out on a small set of documents in ODT and OOXML that included change‐tracking information. As discussed in the previous sections, change‐tracking facilities generate rather complex overlaps even for basic operations on small text fragments, which in turn are expressed as a potentially huge number of standoffs and milestone markup within the XML hierarchy. The same documents were individually converted into EARMARK. We then charted how simple edits under change‐tracking affect the number of nodes in XML formats and of statements in OWL files.
        <link href="#note20">20</link>
        <note xml:id="note20">
          <p>The full details about each version and each format also are available at
            <url href="http://www.essepuntato.it/2011/jasist/discussion">http://www.essepuntato.it/2011/jasist/discussion</url>
          </p>
        </note>
      </p>
      <p>We created seven different versions, named after the “Seven Dwarfs” for recognizability, by applying very common edits (e.g., the insertion of a few words, the deletion of some sentences, the split of a paragraph, etc.) on a small document, creating multiple overlaps. Figure
        <link href="#fig6">6</link> shows the results of our comparison.</p>
      <figure xml:id="fig6">
        <label>6</label>
        <mediaResourceGroup>
          <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig006" />
          <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
          <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
          <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
        </mediaResourceGroup>
        <caption>
          <p>A graph summarizing the results of the first experiment. [Color figure can be viewed in the online issue, which is available at
            <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
        </caption>
      </figure>
      <p>The overall trend is interesting and comforting: While in simple documents with no overlap the node count of XML is lower than is the assertion count of EARMARK triples, the presence of overlaps makes EARMARK and XML formats comparable. The growth of EARMARK statements is in fact very close to the growth of XML nodes when the number of overlaps increases. EARMARK is even more efficient than is XML for more complex documents.</p>
      <p>The measure for each format was done by counting only those nodes and statements instrumental to encode content and (overlapping) structures: We did not take into account either the presentational information for ODT and OOXML (Each file, for instance, includes a very long list of style definitions that are not relevant for the purposes of our analysis.) or namespace declarations (OOXML files, for instance, list all relevant namespaces for the Office toolkit.) or ignorable white spaces (that are only added to indent content and improve readability).</p>
      <p>Interestingly, EARMARK and ODT show a very similar increase in size while OOXML is much more verbose and grows faster. The content of the first version, for instance, is encoded using four nodes in ODT, 13 statements in EARMARK, and 54 nodes in OOXML; the last one contains 241 ODT nodes, 233 EARMARK statements, and 452 OOXML nodes. To return to our original inquiry, it is clear that the weight of EARMARK documents is very good compared to the other ones.</p>
      <p>Also note the regularity in the growth of EARMARK statements. Regardless of the actual modifications applied to the document, in fact, EARMARK adds about 40 statements for each edit. Both OOXML and ODT, on the contrary, show a more irregular “pace.” The reason for this is that EARMARK externalizes <i>all</i> assertions, so that <i>all</i> modifications (either to leaf nodes or to intermediate nodes in the original XML) are “flattened” onto the docuverses and do not depend on the complexity of the structure within which the edit took place.</p>
      <p>Figure
        <link href="#fig7">7</link> shows the results of a similar comparison on a different set of documents and edits. We collected seven versions named after the days of the week and created by seven different authors when editing a very simple document. The overall trend does not change, and shows that EARMARK and ODT again have a comparable behavior, far better than that of OOXML.</p>
      <figure xml:id="fig7">
        <label>7</label>
        <mediaResourceGroup>
          <mediaResource alt="image" eRights="yes" copyright="Wiley Periodicals, Inc." href="urn:x-wiley:15322882:media:ASI21591:fig007" />
          <mediaResource alt="thumbnail image" rendition="webLoRes" mimeType="image/gif" href="" />
          <mediaResource alt="original image" rendition="webOriginal" mimeType="image/jpeg" href="" />
          <mediaResource alt="magnified image" rendition="webHiRes" mimeType="image/jpeg" href="" />
        </mediaResourceGroup>
        <caption>
          <p>A graph summarizing the results of the second experiment. [Color figure can be viewed in the online issue, which is available at
            <url href="http://wileyonlinelibrary.com">wileyonlinelibrary.com</url>.]</p>
        </caption>
      </figure>
      <p>In conclusion, although preliminary, this study shows clear trends of a very conservative behavior of EARMARK with respect to document size.</p>
    </section>
    <section xml:id="sec1-9">
      <title type="main">Conclusions</title>
      <p>Overlaps, far from being an obscure requirement for sophisticated functionalities of arcane markup languages, are a very frequent undertaking even in major data formats and in rather frequent situations. Yet, since the XML language does not allow them, consciously or not, designers of data formats have adopted a huge and entangled array of tricks, special cases, and workarounds that, although solving the actual problem of storing overlapping structures, open new and complicated ones when approaching even basic chores on documents containing them, such as queries.</p>
      <p>The EARMARK approach drastically reduces the efforts needed to perform such chores on overlapping structures since it does not allow the corresponding multiple trees to actually entangle and complicate the job. EARMARK is radically different from both special markup metalanguages that allow overlaps and the introduction of workarounds within the traditional tree‐oriented XML language because it treats multiple trees over the same content as first‐class citizens of the language, yet uses well‐known and standard W3C technologies and languages to perform all tasks. EARMARK documents, at the end, are OWL ontologies. Thus, any Semantic Web technology (e.g., SPARQL) can be used <i>straightforwardly</i> to perform operations on their content.</p>
      <p>Improving queries is not the only application of EARMARK. Validation is another interesting field that we are investigating. In fact, the same ontological framework can be used to prove properties concerning a document, such as validity against a schema, compliance to co‐constraint specifications, or adherence to structural patterns. Moreover, inspired by Marcoux and Rizkallah (
        <link href="#bib24">2009</link>), in which they described an approach for defining natural‐language semantics for XML‐based languages, we also are developing an ontology‐based approach for encoding <i>markup semantics</i>—that is, the formal definition of meanings of markup elements, besides the syntactical structure of a markup document—within EARMARK documents.</p>
    </section>
    <bibliography style="nameDate">
      <title type="main">References</title>
      <bib xml:id="bib1">
        <citation type="other" xml:id="cit2">
          <author>
            <familyName>Adida</familyName>,
            <givenNames>B.</givenNames>
          </author>,
          <author>
            <familyName>Birbeck</familyName>,
            <givenNames>M.</givenNames>
          </author>,
          <author>
            <familyName>McCarron</familyName>,
            <givenNames>S.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Pemberton</familyName>,
            <givenNames>S.</givenNames>
          </author>
          (
          <pubYear year="2008">2008</pubYear>). RDFa in XHTML: Syntax and processing. W3C Recommendation, October 14, 2008, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/rdfa-syntax/">http://www.w3.org/TR/rdfa‐syntax/</url>
        </citation>
      </bib>
      <bib xml:id="bib2">
        <citation type="book" xml:id="cit3">
          <author>
            <familyName>Allsopp</familyName>,
            <givenNames>J.</givenNames>
          </author>
          (
          <pubYear year="2007">2007</pubYear>).
          <bookTitle>Microformats: Empowering your markup for Web 2.0</bookTitle>.
          <publisherLoc>New York, NY</publisherLoc>:
          <publisherName>Friends of ED Press</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib3">
        <citation type="book" xml:id="cit4">
          <author>
            <familyName>Bański</familyName>,
            <givenNames>P.</givenNames>
          </author>
          (
          <pubYear year="2010">2010</pubYear>).
          <chapterTitle>Why TEI stand‐off annotation doesn't quite work: And why you might want to use it nevertheless</chapterTitle>. In
          <bookTitle>Proceedings of Balisage: The Markup Conference 2010</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://www.balisage.net/Proceedings/vol5/html/Banski01/BalisageVol5-Banski01.html">http://www.balisage.net/Proceedings/vol5/html/Banski01/BalisageVol5‐Banski01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib4">
        <citation type="other" xml:id="cit5">
          <author>
            <familyName>Beckett</familyName>,
            <givenNames>D.</givenNames>
          </author>
          (
          <pubYear year="2004">2004</pubYear>). RDF/XML syntax specification (Rev.). W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/2004/REC-rdf-syntax-grammar-20040210/">http://www.w3.org/TR/2004/REC‐rdf‐syntax‐grammar‐20040210/</url>
        </citation>
      </bib>
      <bib xml:id="bib5">
        <citation type="other" xml:id="cit6">
          <author>
            <familyName>Berglund</familyName>,
            <givenNames>A.</givenNames>
          </author>,
          <author>
            <familyName>Boag</familyName>,
            <givenNames>S.</givenNames>
          </author>,
          <author>
            <familyName>Chamberlin</familyName>,
            <givenNames>D.</givenNames>
          </author>,
          <author>
            <familyName>Fernández</familyName>,
            <givenNames>M.F.</givenNames>
          </author>,
          <author>
            <familyName>Kay</familyName>,
            <givenNames>M.</givenNames>
          </author>,
          <author>
            <familyName>Robie</familyName>,
            <givenNames>J.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Siméon</familyName>,
            <givenNames>J.</givenNames>
          </author>
          (
          <pubYear year="2007">2007</pubYear>). XML Path Language (XPath) 2.0. W3C Recommendation, January 23, 2007, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/xpath20/">http://www.w3.org/TR/xpath20/</url>
        </citation>
      </bib>
      <bib xml:id="bib6">
        <citation type="other" xml:id="cit7">
          <author>
            <familyName>Brickley</familyName>,
            <givenNames>D.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Guha</familyName>,
            <givenNames>R.V.</givenNames>
          </author>
          (
          <pubYear year="2004">2004</pubYear>). RDF Vocabulary Description Language 1.0: RDF Schema. W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/rdf-schema/">http://www.w3.org/TR/rdf‐schema/</url>
        </citation>
      </bib>
      <bib xml:id="bib7">
        <citation type="journal" xml:id="cit8">
          <author>
            <familyName>Ciccarese</familyName>,
            <givenNames>P.</givenNames>
          </author>,
          <author>
            <familyName>Wu</familyName>,
            <givenNames>E.</givenNames>
          </author>,
          <author>
            <familyName>Kinoshita</familyName>,
            <givenNames>J.</givenNames>
          </author>,
          <author>
            <familyName>Wong</familyName>,
            <givenNames>G.</givenNames>
          </author>,
          <author>
            <familyName>Ocana</familyName>,
            <givenNames>M.</givenNames>
          </author>,
          <author>
            <familyName>Ruttenberg</familyName>,
            <givenNames>A.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Clark</familyName>,
            <givenNames>T.</givenNames>
          </author>
          (
          <pubYear year="2008">2008</pubYear>).
          <articleTitle>The SWAN biomedical discourse ontology</articleTitle>.
          <journalTitle>Journal of Biomedical Informatics</journalTitle>,
          <vol>41</vol>(
          <issue>5</issue>),
          <pageFirst>739</pageFirst>–
          <pageLast>751</pageLast>.
        </citation>
      </bib>
      <bib xml:id="bib8">
        <citation type="book" xml:id="cit9">
          <author>
            <familyName>DeRose</familyName>,
            <givenNames>S.</givenNames>
          </author> (
          <pubYear year="2004">2004</pubYear>).
          <chapterTitle>Markup overlap: A review and a horse</chapterTitle>. In
          <bookTitle>Proceedings of the Extreme Markup Languages 2004</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://conferences.idealliance.org/extreme/html/2004/DeRose01/EML2004DeRose01.html">http://conferences.idealliance.org/extreme/html/2004/DeRose01/EML2004DeRose01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib9">
        <citation type="book" xml:id="cit10">
          <author>
            <familyName>Di Iorio</familyName>,
            <givenNames>A.</givenNames>
          </author>,
          <author>
            <familyName>Marchetti</familyName>,
            <givenNames>C.</givenNames>
          </author>,
          <author>
            <familyName>Schirinzi</familyName>,
            <givenNames>M.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Vitali</familyName>,
            <givenNames>F.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>).
          <chapterTitle>Natural and multi‐layered approach to detect changes in tree‐based textual documents</chapterTitle>. In
          <editor>
            <givenNames>J.</givenNames>
            <familyName>Cordeiro</familyName>
          </editor> &amp;
          <editor>
            <givenNames>J.</givenNames>
            <familyName>Filipe</familyName>
          </editor> (Eds.),
          <bookTitle>Proceedings of the 11th International Conference on Enterprise Information Systems (ICEIS 2009)</bookTitle> (pp.
          <pageFirst>90</pageFirst>–
          <pageLast>101</pageLast>).
          <publisherLoc>Heidelberg, Germany</publisherLoc>:
          <publisherName>Springer</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib10">
        <citation type="book" xml:id="cit11">
          <author>
            <familyName>Di Iorio</familyName>,
            <givenNames>A.</givenNames>
          </author>,
          <author>
            <familyName>Peroni</familyName>,
            <givenNames>S.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Vitali</familyName>,
            <givenNames>F.</givenNames>
          </author> (
          <pubYear year="2009">2009</pubYear>).
          <chapterTitle>Towards markup support for full GODDAGs and beyond: The EARMARK approach</chapterTitle>. In
          <bookTitle>Proceedings of Balisage: The Markup Conference 2009</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://balisage.net/Proceedings/vol3/html/Peroni01/BalisageVol3-Peroni01.html">http://balisage.net/Proceedings/vol3/html/Peroni01/BalisageVol3‐Peroni01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib11">
        <citation type="book" xml:id="cit12">
          <author>
            <familyName>Di Iorio</familyName>,
            <givenNames>A.</givenNames>
          </author>,
          <author>
            <familyName>Peroni</familyName>,
            <givenNames>S.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Vitali</familyName>,
            <givenNames>F.</givenNames>
          </author>
          (
          <pubYear year="2010">2010</pubYear>).
          <chapterTitle>Handling markup overlaps using OWL</chapterTitle>. In
          <editor>
            <givenNames>P.</givenNames>
            <familyName>Cimiano</familyName>
          </editor> &amp;
          <editor>
            <givenNames>H. S.</givenNames>
            <familyName>Pinto</familyName>
          </editor> (Eds.),
          <bookTitle>Proceedings of the 17th International Conference on Knowledge Engineering and Knowledge Management (EKAW 2010)</bookTitle> (pp.
          <pageFirst>391</pageFirst>–
          <pageLast>400</pageLast>).
          <publisherLoc>Heidelberg, Germany</publisherLoc>:
          <publisherName>Springer</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib12">
        <citation type="journal" xml:id="cit13">
          <author>
            <familyName>Di Iorio</familyName>,
            <givenNames>A.</givenNames>
          </author>,
          <author>
            <familyName>Peroni</familyName>,
            <givenNames>S.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Vitali</familyName>,
            <givenNames>F.</givenNames>
          </author> (in press).
          <articleTitle>Using Semantic Web technologies for analysis and validation of structural markup</articleTitle>.
          <journalTitle>International Journal of Web Engineering and Technology</journalTitle>.
        </citation>
      </bib>
      <bib xml:id="bib13">
        <citation type="book" xml:id="cit14">
          <author>
            <familyName>Drummond</familyName>,
            <givenNames>N.</givenNames>
          </author>,
          <author>
            <familyName>Rector</familyName>,
            <givenNames>A.</givenNames>
          </author>,
          <author>
            <familyName>Stevens</familyName>,
            <givenNames>R.</givenNames>
          </author>,
          <author>
            <familyName>Moulton</familyName>,
            <givenNames>G.</givenNames>
          </author>,
          <author>
            <familyName>Horridge</familyName>,
            <givenNames>M.</givenNames>
          </author>,
          <author>
            <familyName>Wang</familyName>,
            <givenNames>H.H.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Seidenberg</familyName>,
            <givenNames>J.</givenNames>
          </author>
          (
          <pubYear year="2006">2006</pubYear>).
          <chapterTitle>Putting OWL in order: Patterns for sequences in OWL</chapterTitle>. In
          <editor>
            <givenNames>B. C.</givenNames>
            <familyName>Grau</familyName>
          </editor>,
          <editor>
            <givenNames>P.</givenNames>
            <familyName>Hitzler</familyName>
          </editor>,
          <editor>
            <givenNames>C.</givenNames>
            <familyName>Shankey</familyName>
          </editor>, &amp;
          <editor>
            <givenNames>E.</givenNames>
            <familyName>Wallace</familyName>
          </editor> (Eds.),
          <bookTitle>Proceedings of the Workshop on OWL: Experiences and Directions (OWLED 2006)</bookTitle>,
          <publisherLoc>Athens, GA</publisherLoc>. Retrieved from
          <url href="http://sunsite.informatik.rwth-aachen.de/Publications/CEUR-WS/Vol-216/submission_12.pdf">http://sunsite.informatik.rwth‐aachen.de/Publications/CEUR‐WS/Vol‐216/submission_12.pdf</url>
        </citation>
      </bib>
      <bib xml:id="bib14">
        <citation type="other" xml:id="cit15">
          <author>
            <familyName>Durand</familyName>,
            <givenNames>D.G.</givenNames>
          </author>
          (
          <pubYear year="1994">1994, October</pubYear>).
          <otherTitle>Palimpsest, a data model for revision control</otherTitle>. Paper presented at the Workshop on Collaborative Editing Systems at the Computer Supported Cooperative Work Conference (CSCW94), Chapel Hill, NC.
        </citation>
      </bib>
      <bib xml:id="bib15">
        <citation type="book" xml:id="cit16">
          <author>
            <familyName>Durand</familyName>,
            <givenNames>D.G.</givenNames>
          </author> (
          <pubYear year="2008">2008</pubYear>).
          <bookTitle>Palimpsest: Change‐oriented concurrency control for the support of collaborative applications</bookTitle>.
          <publisherLoc>Charleston, SC</publisherLoc>:
          <publisherName>CreateSpace</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib16">
        <citation type="other" xml:id="cit17">
          <author>
            <familyName>Garlik</familyName>,
            <givenNames>S.H.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Seaborne</familyName>,
            <givenNames>A.</givenNames>
          </author>
          (
          <pubYear year="2010">2010</pubYear>). SPARQL 1.1 Query Language. W3C Working Draft, October 14, 2010, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/sparql11-query/">http://www.w3.org/TR/sparql11‐query/</url>
        </citation>
      </bib>
      <bib xml:id="bib17">
        <citation type="book" xml:id="cit18">
          <author>
            <familyName>Georg</familyName>,
            <givenNames>R.</givenNames>
          </author>,
          <author>
            <familyName>Schonefeld</familyName>,
            <givenNames>O.</givenNames>
          </author>,
          <author>
            <familyName>Trippel</familyName>,
            <givenNames>T.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Witt</familyName>,
            <givenNames>A.</givenNames>
          </author>
          (
          <pubYear year="2010">2010</pubYear>).
          <chapterTitle>Sustainability of linguistic resources revisited</chapterTitle>. In
          <bookTitle>Proceedings of the International Symposium on XML for the Long Haul: Issues in the Long‐Term Preservation of XML</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://www.balisage.net/Proceedings/vol6/html/Witt01/BalisageVol6-Witt01.html">http://www.balisage.net/Proceedings/vol6/html/Witt01/BalisageVol6‐Witt01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib18">
        <citation type="book" xml:id="cit19">
          <author>
            <familyName>Goldfarb</familyName>,
            <givenNames>C.F.</givenNames>
          </author> (
          <pubYear year="1990">1990</pubYear>).
          <bookTitle>The SGML Handbook</bookTitle>.
          <publisherLoc>New York, NY</publisherLoc>:
          <publisherName>Oxford University Press</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib19">
        <citation type="other" xml:id="cit20">
          <author>
            <familyName>Horridge</familyName>,
            <givenNames>M.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Patel‐Schneider</familyName>,
            <givenNames>P.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>). OWL 2 Web Ontology Language: Manchester Syntax. W3C Working Group Note, October 27, 2009, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/owl2-manchester-syntax/">http://www.w3.org/TR/owl2‐manchester‐syntax/</url>
        </citation>
      </bib>
      <bib xml:id="bib20">
        <citation type="other" xml:id="cit21">
          <author>
            <familyName>Horrocks</familyName>,
            <givenNames>I.</givenNames>
          </author>,
          <author>
            <familyName>Patel‐Schneider</familyName>,
            <givenNames>P.F.</givenNames>
          </author>,
          <author>
            <familyName>Boley</familyName>,
            <givenNames>H.</givenNames>
          </author>,
          <author>
            <familyName>Tabet</familyName>,
            <givenNames>S.</givenNames>
          </author>,
          <author>
            <familyName>Grosof</familyName>,
            <givenNames>B.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Dean</familyName>,
            <givenNames>M.</givenNames>
          </author>
          (
          <pubYear year="2004">2004</pubYear>). SWRL: A Semantic Web rule language combining OWL and RuleML. W3C Member Submission, May 21, 2004, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/Submission/SWRL/">http://www.w3.org/Submission/SWRL/</url>
        </citation>
      </bib>
      <bib xml:id="bib21">
        <citation type="other" xml:id="cit22">
          <author>
            <familyName>Huitfeldt</familyName>,
            <givenNames>C.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Sperberg‐McQueen</familyName>,
            <givenNames>C.M.</givenNames>
          </author>
          (
          <pubYear year="2003">2003</pubYear>). TexMECS: An experimental markup meta‐language for complex documents. Retrieved from
          <url href="http://decentius.aksis.uib.no/mlcd/2003/Papers/texmecs.html">http://decentius.aksis.uib.no/mlcd/2003/Papers/texmecs.html</url>
        </citation>
      </bib>
      <bib xml:id="bib22">
        <citation type="book" xml:id="cit23">
          <groupName>JTC1/SC34 WG 4</groupName>. (
          <pubYear year="2008">2008</pubYear>). ISO/IEC 29500‐1:2008—Information technology—Document description and processing languages—Office Open XML File Formats: Part 1.
          <bookTitle>Fundamentals and markup language reference</bookTitle>.
          <publisherLoc>Geneva, Switzerland</publisherLoc>:
          <publisherName>International Organization for Standardization</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib23">
        <citation type="book" xml:id="cit24">
          <groupName>JTC1/SC34 WG 6</groupName>. (
          <pubYear year="2006">2006</pubYear>).
          <bookTitle>ISO/IEC 26300:2006—Information technology—Open document format for office applications (OpenDocument), Version 1.0</bookTitle>.
          <publisherLoc>Geneva, Switzerland</publisherLoc>:
          <publisherName>International Organization for Standardization</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib24">
        <citation type="journal" xml:id="cit25">
          <author>
            <familyName>Marcoux</familyName>,
            <givenNames>Y.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Rizkallah</familyName>,
            <givenNames>E.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>).
          <articleTitle>Intertextual semantics: A semantics for information design</articleTitle>.
          <journalTitle>Journal of the American Society for Information Science and Technology</journalTitle>,
          <vol>60</vol>(
          <issue>9</issue>),
          <pageFirst>1895</pageFirst>–
          <pageLast>1906</pageLast>.
        </citation>
      </bib>
      <bib xml:id="bib25">
        <citation type="journal" xml:id="cit26">
          <author>
            <familyName>Marinelli</familyName>,
            <givenNames>P.</givenNames>
          </author>,
          <author>
            <familyName>Vitali</familyName>,
            <givenNames>F.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Zacchiroli</familyName>,
            <givenNames>S.</givenNames>
          </author>
          (
          <pubYear year="2008">2008</pubYear>).
          <articleTitle>Towards the unification of formats for overlapping markup</articleTitle>.
          <journalTitle>New Review of Hypermedia and Multimedia</journalTitle>,
          <vol>14</vol>(
          <issue>1</issue>),
          <pageFirst>57</pageFirst>–
          <pageLast>94</pageLast>.
        </citation>
      </bib>
      <bib xml:id="bib26">
        <citation type="book" xml:id="cit27">
          <author>
            <familyName>Nelson</familyName>,
            <givenNames>T.</givenNames>
          </author> (
          <pubYear year="1980">1980</pubYear>).
          <bookTitle>Literary machines: The report on, and of, Project Xanadu concerning word processing, electronic publishing, hypertext, thinkertoys, tomorrow's intellectual … including knowledge, education and freedom</bookTitle>.
          <publisherLoc>Sausalito, CA</publisherLoc>:
          <publisherName>Mindful Press</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib27">
        <citation type="book" xml:id="cit28">
          <author>
            <familyName>Peroni</familyName>,
            <givenNames>S.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Vitali</familyName>,
            <givenNames>F.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>).
          <chapterTitle>Annotations with EARMARK for arbitrary, overlapping and out‐of order markup</chapterTitle>. In
          <editor>
            <givenNames>U.M.</givenNames>
            <familyName>Borghoff</familyName>
          </editor> &amp;
          <editor>
            <givenNames>B.</givenNames>
            <familyName>Chidlovskii</familyName>
          </editor> (Eds.),
          <bookTitle>Proceedings of the 2009 ACM Symposium on Document Engineering (DocEng 2009)</bookTitle> (pp.
          <pageFirst>171</pageFirst>–
          <pageLast>180</pageLast>).
          <publisherLoc>New York, NY</publisherLoc>:
          <publisherName>ACM</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib28">
        <citation type="book" xml:id="cit29">
          <author>
            <familyName>Portier</familyName>,
            <givenNames>P.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Calabretto</familyName>,
            <givenNames>S.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>).
          <chapterTitle>Methodology for the construction of multi‐structured documents</chapterTitle>. In
          <bookTitle>Proceedings of Balisage: The Markup Conference 2009</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://balisage.net/Proceedings/vol3/html/Portier01/BalisageVol3-Portier01.html">http://balisage.net/Proceedings/vol3/html/Portier01/BalisageVol3-Portier01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib29">
        <citation type="journal" xml:id="cit30">
          <author>
            <familyName>Riggs</familyName>,
            <givenNames>K.R.</givenNames>
          </author>
          (
          <pubYear year="2002">2002</pubYear>).
          <articleTitle>XML and free text</articleTitle>.
          <journalTitle>Journal of the American Society for Information Science and Technology</journalTitle>,
          <vol>53</vol>(
          <issue>6</issue>),
          <pageFirst>526</pageFirst>–
          <pageLast>528</pageLast>.
        </citation>
      </bib>
      <bib xml:id="bib30">
        <citation type="journal" xml:id="cit31">
          <author>
            <familyName>Salembier</familyName>,
            <givenNames>P.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Benitez</familyName>,
            <givenNames>A.B.</givenNames>
          </author>
          (
          <pubYear year="2007">2007</pubYear>).
          <articleTitle>Structure description tools</articleTitle>.
          <journalTitle>Journal of the American Society for Information Science and Technology</journalTitle>,
          <vol>58</vol>(
          <issue>9</issue>),
          <pageFirst>1329</pageFirst>–
          <pageLast>1337</pageLast>.
        </citation>
      </bib>
      <bib xml:id="bib31">
        <citation type="book" xml:id="cit32">
          <author>
            <familyName>Schmidt</familyName>,
            <givenNames>D.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>).
          <chapterTitle>Merging multi‐version texts: A generic solution to the overlap problem</chapterTitle>. In
          <bookTitle>Proceedings of Balisage: The Markup Conference 2009</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://balisage.net/Proceedings/vol3/html/Schmidt01/BalisageVol3-Schmidt01.html">http://balisage.net/Proceedings/vol3/html/Schmidt01/BalisageVol3-Schmidt01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib32">
        <citation type="journal" xml:id="cit33">
          <author>
            <familyName>Schmidt</familyName>,
            <givenNames>D.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Colomb</familyName>,
            <givenNames>R.</givenNames>
          </author>
          (
          <pubYear year="2009">2009</pubYear>).
          <articleTitle>A data structure for representing multi‐version texts online</articleTitle>.
          <journalTitle>International Journal of Human–Computer Studies</journalTitle>,
          <vol>67</vol>(
          <issue>6</issue>),
          <pageFirst>497</pageFirst>–
          <pageLast>514</pageLast>.
        </citation>
      </bib>
      <bib xml:id="bib33">
        <citation type="book" xml:id="cit34">
          <author>
            <familyName>Schonefeld</familyName>,
            <givenNames>O.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Witt</familyName>,
            <givenNames>A.</givenNames>
          </author> (
          <pubYear year="2006">2006</pubYear>).
          <chapterTitle>Towards validation of concurrent markup</chapterTitle>. In
          <bookTitle>Proceedings of the Extreme Markup Languages 2006</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://conferences.idealliance.org/extreme/html/2006/Schonefeld01/EML2006Schonefeld01.html">http://conferences.idealliance.org/extreme/html/2006/Schonefeld01/EML2006Schonefeld01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib34">
        <citation type="book" xml:id="cit35">
          <author>
            <familyName>Sperberg‐McQueen</familyName>,
            <givenNames>C.M.</givenNames>
          </author>
          (
          <pubYear year="2006">2006</pubYear>).
          <chapterTitle>Rabbit/duck grammars: A validation method for overlapping structures</chapterTitle>. In
          <bookTitle>Proceedings of Extreme Markup Languages Conference 2006</bookTitle>.
          <publisherLoc>Rockville, MD</publisherLoc>:
          <publisherName>Mulberry Technologies</publisherName>. Retrieved from
          <url href="http://conferences.idealliance.org/extreme/html/2006/SperbergMcQueen01/EML2006SperbergMcQueen01.html">http://conferences.idealliance.org/extreme/html/2006/SperbergMcQueen01/EML2006SperbergMcQueen01.html</url>
        </citation>
      </bib>
      <bib xml:id="bib35">
        <citation type="book" xml:id="cit36">
          <author>
            <familyName>Sperberg‐McQueen</familyName>,
            <givenNames>C.M.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Huitfeldt</familyName>,
            <givenNames>C.</givenNames>
          </author>
          (
          <pubYear year="2004">2004</pubYear>).
          <chapterTitle>GODDAG: A data structure for overlapping hierarchies</chapterTitle>. In
          <editor>
            <givenNames>P.R.</givenNames>
            <familyName>King</familyName>
          </editor> &amp;
          <editor>
            <givenNames>E.V.</givenNames>
            <familyName>Munson</familyName>
          </editor> (Eds.),
          <bookTitle>Proceedings of the 5th International Workshop on the Principles of Digital Document Processing (PODDP 2000)</bookTitle> (pp.
          <pageFirst>139</pageFirst>–
          <pageLast>160</pageLast>).
          <publisherLoc>Heidelberg, Germany</publisherLoc>:
          <publisherName>Springer</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib36">
        <citation type="other" xml:id="cit37">
          <groupName>TEI Consortium</groupName>. (
          <pubYear year="2005">2005</pubYear>). TEI P5: Guidelines for electronic text encoding and interchange. Retrieved from
          <url href="http://www.tei-c.org/Guidelines/P5">http://www.tei-c.org/Guidelines/P5</url>
        </citation>
      </bib>
      <bib xml:id="bib37">
        <citation type="other" xml:id="cit38">
          <author>
            <familyName>Tennison</familyName>,
            <givenNames>J.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Piez</familyName>,
            <givenNames>W.</givenNames>
          </author>
          (
          <pubYear year="2002">2002, August</pubYear>). The Layered Markup and Annotation Language (LMNL). Paper presented at the Extreme Markup Languages Conference 2002, Montreal, Canada.
        </citation>
      </bib>
      <bib xml:id="bib38">
        <citation type="book" xml:id="cit39">
          <author>
            <familyName>Tummarello</familyName>,
            <givenNames>G.</givenNames>
          </author>,
          <author>
            <familyName>Morbidoni</familyName>,
            <givenNames>C.</givenNames>
          </author>, &amp;
          <author>
            <familyName>Pierazzo</familyName>,
            <givenNames>E.</givenNames>
          </author>
          (
          <pubYear year="2005">2005</pubYear>).
          <chapterTitle>Toward textual encoding based on RDF</chapterTitle>. In
          <editor>
            <givenNames>M.</givenNames>
            <familyName>Dobreva</familyName>
          </editor> &amp;
          <editor>
            <givenNames>J.</givenNames>
            <familyName>Engelen</familyName>
          </editor> (Eds.),
          <bookTitle>Proceedings of the Ninth ICCC International Conference on Electronic Publishing (ELPUB2005)</bookTitle>.
          <publisherLoc>Leuven, Belgium</publisherLoc>:
          <publisherName>Peeters</publisherName>.
        </citation>
      </bib>
      <bib xml:id="bib39">
        <citation type="other" xml:id="cit40">
          <groupName>W3C OWL Working Group</groupName>. (
          <pubYear year="2009">2009</pubYear>). OWL 2 web ontology language document overview. W3C Recommendation, October 27, 2009, World Wide Web Consortium. Retrieved from
          <url href="http://www.w3.org/TR/owl2-overview/">http://www.w3.org/TR/owl2-overview/</url>
        </citation>
      </bib>
    </bibliography>
  </body>
</component>