diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b3fc27c --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +0?-*/ diff --git a/dtd/Wileyml3gv20-flat/Wileyml3gv20-flat.dtd b/dtd/Wileyml3gv20-flat/Wileyml3gv20-flat.dtd new file mode 100644 index 0000000..85bb9f7 --- /dev/null +++ b/dtd/Wileyml3gv20-flat/Wileyml3gv20-flat.dtddiff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-bib.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-bib.rng new file mode 100644 index 0000000..6bba3d1 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-bib.rng @@ -0,0 +1,570 @@ + + + + + + + + + + + + + + + + whether the title uses the original publication language (default value is yes) + + + + + + + + In a citation, the title of an article, encyclopedia entry, proceedings paper, or other item in a non-book publication. + + + + + + + + + + + + + + + + + + In a citation, an author's name. + + + + + + + + + + + + + + + + + + + + + + + + + A wrapper for one or more ++++s in a ++++. + + + + + + + + + + + + + + + + whether the items in this bibliography are cited in the article (default value is yes) + + + + + + + + + bibliography style hint (default value is numbered) + + numbered + nameDate + + + + + + + + A bibliography. + + + + + + + + + + + + + + + + + + + + + + + + + + + whether the items in this section are cited in the article + + + + + + bibliography style hint + + numbered + nameDate + + + + + + + + A division of the bibliography. + + + + + + + + + + + + + + + + + + + + + + + + In a book citation, a series title. + + + + + + + + + + + + + + + + + + + + + whether the title uses the original publication language (default value is yes) + + + + + + + + In a book citation, the book title. + + + + + + + + + + + + + + + + + + In a citation, the title of a chapter. + + + + + + + + + + + + + + + + + + + + + + A citation in a ++++, ++++, or as an inline reference. + + + + + + + + + + + + + + + + + In a legal case ++++, the name of the defendant. 
+ + + + + + + + + + + + + + + + + + + + + In a book citation, the edition. + + + + + + + + + + + + + + + + whether the editor is an editor of a series (absence does not imply either value) + + + + + + + + In a citation, an editor's name. + + + + + + + + + + + + + + + + In a citation, an ID for citing an electronic publication. + + + + + + + + + + + the numeric value + + + + + + + In a citation, an issue or supplement number. + + + + + + + + + + + + + + + + In a journal citation, the journal title. + + + + + + + + + + + + + + + + whether the title uses the original publication language (default value is 'yes') + + + + + + + + In a citation, the title of a publication that is not one of the specific citation types. + + + + + + + + + + + + + + + + + + In a legal case citation, a plaintiff. + + + + + + + + + + + + + + + the year in YYYY format + + + + + + In a citation, the publication year. + + + + + + + + + + + In a statute citation, a statute title. + + + + + + + + + + + + + + + + + + the numeric value + + + + + + + In the bibliography, a volume number. + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-cookery.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-cookery.rng new file mode 100644 index 0000000..571cd22 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-cookery.rng @@ -0,0 +1,286 @@ + + + + + + + + + + + + + a category for the recipe + + + + + the difficulty of the recipe + + + + the origin of the recipe + + + + + + A (culinary) recipe. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A (culinary) recipe's yield, i.e., the number of servings or amount that the recipe makes. + + + + + + + + + + + + + + + + + + + + + The time required to make all, or a part of, a recipe. + + + + + + + + + + + + + + + + A (culinary) recipe's introduction. + + + + + + + + + + + + + + + + A list of ingredients for a ++++. + + + + + + + + + + + + + + + + + + + A (culinary) recipe's nutrition information, such as calories and diet exchanges. 
+ + + + + + + + + + + + + + + + The procedure part of a (culinary) recipe. + + + + + + + + + + + + + + + + + + + A (culinary) recipe's tools. + + + + + + + + + + + + + + + + a category for the recipe + + + + + + + A variation of a (culinary) recipe. + + + + + + + + + + + + + + + + A wrapper for one or more variations of a (culinary) recipe. These typically appear in print following the procedure ( ++++). + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-core.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-core.rng new file mode 100644 index 0000000..2ca0063 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-core.rng @@ -0,0 +1,1371 @@ + + + + + + + + + + + + + + a URI or IRI identifying the scheme of the ID and providing the id itself + + + + + + + wng:accessionId + An accession key that can be used to access a record in a database (e.g., GenBank, PubMed). + + + + + + + + + + + + + + + + + + + + + + + + + + + + A generic element for one of five types of out-of-flow (floating) objects. The type of block is specified in the type attribute. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A generic element for several kinds of in-flow (fixed) objects. The type of blockFixed is specified in the type attribute. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + wng:caption + A caption for a figure or other displayed block or supporting information item. + + + + + + + + + + + + + + + + + + + + + + whether this chemical structure is numbered (default value is yes) + + + + + + URI for the original displayed structure of which this is a repeat + + + + + !suppress! + + + + + + + + whether the contained image has a number as part of its picture (default value is no) + + + + + + + + A single displayed chemical structure. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + whether the content should be monospaced (default is 'yes') + + yes + no + + + + + + + + A portion of computer code (or pseudo-code). + + + + + + + + + + + + + + + + wng:data + The data of a ++++. This might consist of SVG, CML, TeX/LaTeX (for legacy products), or any other resource type permitted by Wiley. + + + + + + + + + + + + + whether this item is numbered (default value is yes) + + + + + + URI for the original displayed equation of which this is a repeat + + + + + !suppress! + + + + + + + + additional labeling text + + + + + + + A displayed and usually numbered mathematical equation, chemical reaction, or similarly treated piece of text. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + whether this feature is numbered (default is yes) + + + + + + + + + + + + + Is this feature at the level of a section? (default is 'no') + + + + + + Categorization used by PD + + + + + + + A (floating) feature. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Categorization used by PD + + + + + + + A feature that is part of the flow of the main text and must remain in place for the text to make sense. + + + + + + + + + + + + + The field name + + + + Whether this field is optional for presentation (default is 'yes') + + + + + + + + One of the pieces of information held in a ++++. + + + + + + + + + + + + + + + + + A figure or other image-based floating figure-like object, typically numbered. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A constituent part of a multi-file figure where each part has a separate caption. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + wng:infoAsset + An item of text with semantic significance, such as a protein name, chemical name, brand name, etc. 
+ + + + + + + + + + + + + + + + + a URI or IRI reference for the resource + + + + an explicit path for the resource + + + + + extra information concerning the resource, such as its availability + + + + an alternative description for accessibility requirements + + + + resource width + + + + + resource height + + + + + + + wng:inlineGraphic + A small graphic in line with the text, not set off from it. + + + + + + + + + + + + + + + + + whether this line counts towards the numbering scheme (default value is yes) + + + + + + whether the line number of this line should be displayed (default is yes) + + + + + + indentation level + + 1 + 2 + 3 + 4 + 5 + flushRight + + + + + + + + wng:line + In ++++, a line of text. + + + + + + + + + + + + + + + + + + + + + + link actuation behavior (default value is replaceOnRequest) + + replaceOnRequest + newOnRequest + newOnLoad + embedOnRequest + + + + + A URI, IRI, or relative reference (or sequence of same) giving the names(s) or location(s) of the object(s) + + + + presentational hinting (default value is assoc) + + + + + root file for a compound media object + + + + + + + A link (cross-reference) to another location within the document or to a website associated with the document, or to an external document (or a location within one). + + + + + + + + + + + + + + + + + + + Use default white space processing or preserve source spacing (default value is 'default') + + default + preserve + + + + + + whether or not line numbers will be generated (default value is 'no') + + + + + + the frequency at which generated line numbers will appear (default value is '1') + + + + + + + wng:lineatedText + Text where line breaks are significant, such as poetry or computer code. + + + + + + + + + + + + + An attribution or date on a ++++. + + + + + + + + + + + + + + + + + + + + + + whether this item is to be counted in the numbering scheme (default value is yes) + + + + + + + + A mathematical theorem, proof, etc. 
+ + + + + + + + + + + + + + + + + + + + MIME type indicating the specification of the resource's format + + + + + a URI, IRI, or relative reference giving the name or location of the resource (if resource is not provided as element content) + + + + + the version of the specification used by the resource + + + + + + thumbnail, archive, etc. + + + + an alternative description for accessibility requirements + + + + reference to a resource containing a longer description of an image, for accessibility requirements + + + + + !suppress! + + + + + Extra information for the type of resource (e.g., whether it is color). Pre-defined values: 'missing', 'missingGifOrJpg', 'notImageGallery', 'animation', 'audio', 'dataset', 'illustration', 'interactivity', lecturePresentation', 'photo', 'realia', 'tutorial', 'video', 'webResource', 'map', 'color', 'clGraphicAdditional', 'clGraphicGenerated'. + + + + + root file for a compound media object + + + + + resource width + + + + + resource height + + + + + + + wng:mediaResource + An external resource such as an image file or an inline resource such as some SVG code. + + + + + + + + + + + + + + + + + + + wng:mediaResourceGroup + A wrapper for ++++s that refer to, or are renditions of, the same resource. + + + + + + + + + + + + + + + + For specializing the use of the element. Pre-defined value: 'annotation'. + + + + + + + + a number or other label for this paragraph + + + + + + + A paragraph or portion of text. + + + + + + + + + + + + + + + + + + + + whether this item is to be counted in the numbering scheme + + + + + + + + + + + + + + + + + A container for ++++s, representing information corresponding to that stored in a database record. 
+ + + + + + + + + + + + + + + + upper left X coordinate of a rectangular region, given on a scale of 0 (left edge of resource) to 1 (right edge) + + + upper left Y coordinate of a rectangular region, given on a scale of 0 (top edge of resource) to 1 (bottom edge) + + + lower right X coordinate of a rectangular region, given on a scale of 0 (left edge of resource) to 1 (right edge) + + + lower right Y coordinate of a rectangular region, given on a scale of 0 (top edge of resource) to 1 (bottom edge) + + + + + + A rectangular region defined by relative coordinates (0.0 to 1.0) in an image. + + + + + + + + + + + + + A salutation in a letter. + + + + + + + + + + + + + + + + + + + + + + + whether this section is numbered (only needed if different from inherited value) + + + + + + + + + + + A section. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + wng:source + A source of a quotation or content that has previously been published elsewhere. + + + + + + + + + + + + + + + + + + + + + + + For specializing the use of the element. Pre-defined values: 'abbreviation', 'key', 'displayForm'. + + + + + reference to a definition + + + + + !suppress! + + + + + whether term is emphasized (default value is 'yes') + + + + + + + + + + + wng:term + A term or abbreviation. + + + + + + + + + + + + + + + + + + + + + + + whether to show this definition inline (default value is 'no') + + + + + + + + wng:termDefinition + Definition of a term or expansion or an abbreviation or acronym. + + + + + + + + + + + + + + + + + + + + + A container for different versions of the same ++++. 
+ + + + + + + + + + + + + for collation + + + + + + + the type of the title + + + + plain text version of title for an audio reader, for example, where math or graphics is substituted by a description + + + + + URI to provide a link for this specific instance of the occurrence of an associated icon declared as iconRef for the title type + + + + + entry point if the href URI is a compound media object + + + + + + + A title of a publication or of some item in it. + + + + + + + + + + + + + + + the URL reference, conforming to RFC 3986. + + + + + + A URL or IRL. + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-high.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-high.rng new file mode 100644 index 0000000..d88c6cf --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-high.rng @@ -0,0 +1,303 @@ + + + + + + + + + + + + + + + + + + whether the appendix is numbered (only needed if different from value of 'sectionsNumbered' on <body>) + + + + + + + + An appendix within a chapter or article. + + + + + + + + + + + + + + + + + + + whether the sections, appendices, and protocols of this body are numbered (unless overridden on those elements) + + + + + + + The narrative content of a document. + + + + + + + + + + + + + + + + + + + + + + + the model version, which must be '2.0' + 2.0 + + + + + + + A root element (top-level structure) that is divided into a <header> and a <body>. + + + + + + + + + + + + + + + + + + + + + + + + + An article-level list of abbreviations, acronyms, or terms and their definitions or expansions. + + + + + + + + + + + + + + + + + the type of term to be used in this structure + + + + + presentational hinting + + + + + + + An automatically-generated glossary, list of abbreviations, acronyms, etc. + + + + + + + + + + + + + + + + + + + + + Container for the non-narrative portions of a document (metadata, title, authors, abstract, etc.). 
+ + + + + + + + + + + + + + + + + + + + responseToDoi + the DOI of the letter this responds to in another article (if it exists) + + + + + + + A letter to the editor or other article that constitutes a single piece of correspondence. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Reproduction of a complete component (typically a journal article) within another, for commentary or historical reasons. + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-index.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-index.rng new file mode 100644 index 0000000..b031fbe --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-index.rng @@ -0,0 +1,118 @@ + + + + + + + + + + + + + wng:index + An index. + + + + + + + + + + + + + + + + + + + reference to one or more other index entries that provide the information for this item + + + + + !suppress! + + + + + reference to one or more other index entries that have additional relevant information + + + + + !suppress! + + + + + sort key for the entry + + + + + + + wng:indexEntry + An entry in an index. + + + + + + + + + + + + + + + + + + + + + wng:indexEntryGroup + A section in an index. + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-lists.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-lists.rng new file mode 100644 index 0000000..7d8e17d --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-lists.rng @@ -0,0 +1,208 @@ + + + + + + + + + + + + + + list enumeration style (default value is plain) + + + + + + additional formatting (default value is hanging) + + hanging + paragraph + + + + + + hint whether to render compactly (default value is yes) + + + + + + points to any earlier list this list is a continuation of + + + + + !suppress! + + + + + + + A list. + + + + + + + + + + + + + + + + + + + + + + + + + + + An item in a list. + + + + + + + + + + + + + + + + + + + + + + An associated pair (or more) of list items. 
+ + + + + + + + + + + + + + + + + hints whether to render compactly (default value is yes) + + + + + + list enumeration style (default value is plain) + + + + + + points to any earlier listPaired this listPaired is a continuation of + + + + + !suppress! + + + + + point to any listPaired this list should use the same column alignment as + + + + + !suppress! + + + + + + + A list of pairs of items, such as terms and definitions. + + + + + + + + + + + + + + + + + + + List headings for a listPaired. + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-manifest.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-manifest.rng new file mode 100644 index 0000000..ba04949 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-manifest.rng @@ -0,0 +1,170 @@ + + + + + + + + + + + + + + + + + + + + + the model version, which must be '2.0' + 2.0 + + + + + + + A composite representing a standalone file or item within it. + + + + + + + + + + + + + + + + + + + + + + + + + + + A URI or IRI giving the location of the resource + + + + root file for a compound media object + + + + + A URI or IRI giving a location in another document of where to place the resource before + + + + + !suppress! + + + + + A URI or IRI giving a location in another document of where to place the resource after + + + + + !suppress! + + + + + + + Specification of a resource to be included and/or identified in the ++++. + + + + + + + + + + + + + + + + + + + + + + + + + + + Additional metadata in a ++++. + + + + + + + + + + + Wrapper for ++++ items. + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-metadata.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-metadata.rng new file mode 100644 index 0000000..c8cafd8 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-metadata.rng @@ -0,0 +1,2197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + An abstract of an article, chapter, etc. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A container for one or more ++++s. 
+ + + + + + + + + + + + + wng:address + Address information in an affiliation. + + + + + + + + + + + + + + + + + + + + + + the type of the affiliation (default value is organization) + + + + + + + The affiliation details of a creator. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A wrapper for ++++s. + + + + + + + + + + + + + + + specifies an element for which an attribute value will be constrained + + + specifies an attribute for which an attribute value will be constrained + + + specifies an allowed value for a given element/attribute + + + + URI or IRI for an icon associated with this value + + + + + presentational hint if associated icon should only appear in certain output types (print, online) + + + + + !suppress! + + + + + description of intended appearance or other information for this value + + + + + + + Declares a value for a given attribute on a given element when the attribute is not constrained by controlled values and when the value is not a 'predefined' value. + + + + + + + + + + + A wrapper for ++++s. + + + + + + + + + + + + + + + + A wrapper for biographical information about an article, chapter, or book creator (typically an author). + + + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, a city (part of an ++++). + + + + + + + + + + + + + + + + In metadata, a creator's contact details. + + + + + + + + + + + + + + + + + + + + + + + + + + + A container element for chapter/article-level metadata that is inherent to the article/chapter, not an artifact of its publication. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ownership + the entity that owns the copyright (default value is publisher) + + + + + + + In metadata, a copyright statement. + + + + + + + + + + + + + + + + In metadata for a journal article, information about the corresponding author(s). 
+ + + + + + + + + + + + + + + + + type + the type of item or property being counted + + + the numeric value + + + the number (count) of that type of item + + + + + + In metadata, a count of a particular type of item. + + + + + + + + + + In metadata, a wrapper for + elements. + + + + + + + + + + + + + In metadata, a country (part of an ++++). + + + + + + + + + + + In metadata, a state, county, province or other national sub-entity (part of an ++++). + + + + + + + + + + cover start date in ISO 8601 format (see http://www.w3.org/TR/NOTE-datetime) + + + + + + The cover date of a journal issue. + + + + + + + + + + + + the creator's role (default value is author) + + + + + points to the current affiliation of the creator if this is different from their affiliation when they authored the content + + + + + !suppress! + + + + + points to the parts of the document authored by this creator if the creator is not responsible for the whole document + + + + + is this an author for correspondence? (default value is no) + + + + + + !suppress! + + + + + points to the affiliation(s) for this creator + + + + + !suppress! + + + + + points to a participated-in collaboration(s) for this creator + + + + + !suppress! + + + + + points to a note(s) for this creator + + + + + !suppress! + + + + + + + In metadata, a creator (e.g., author) of the document's content. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, the creators (e.g., authors) of the document's content. + + + + + + + + + + + + + + + + + + + + + wng:dedication + In content metadata, typically for a journal article, a dedication. + + + + + + + + + + + + + + + + + In metadata, part of a person's structured name showing their degrees and qualifications. + + + + + + + + + + + who registered the DOI (defaults to wiley) + + + + + whether the DOI is registered + + + + + + + In metadata, a Digital Object Identifier. 
+ + + + + + + + + + + usable form of the email address if the element content differs from this (e.g., if anti-spam obscured or if the email address contains character entities) + + + + + + + An email address. + + + + + + + + + + + + When a compound should receive special treatment, use relevance='primary'. If 'primary' is used, then 'secondary' must be used on other enrichedObjects. + + primary + secondary + + + + + + A space-separated list of URIs identifying information about the object, such as a region of a graphic or a section of text. + + + + + + + Detailed information about a chemical compound or (in future) other types of objects. + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, a wrapper for ++++s. + + + + + + + + + + + + + the date (and optionally, time) of the event in ISO 8601 format (see http://www.w3.org/TR/NOTE-datetime) + + + + who/what effected the event + + + + + + + In metadata, an event, its date, and optionally the agent (human or system) that performed the action. + + + + + + + + + + In metadata, a wrapper for ++++s. + + + + + + + + + + + + + In metadata or in a citation, part of a person's structured name. + + + + + + + + + + + + + + + + In metadata or in a citation, prefix to the family name of a person. + + + + + + + + + + + In metadata, a fax number. + + + + + + + + + + + the funding agency's name as given in the controlled FundRef taxonomy + + + + + the DOI assigned by FundRef for the funding agency + + + + + + + The name of an organization (contract/grant sponsor) that provided sponsorship for the work. + + + + + + + + + + + + + + + + In metadata, a wrapper for information about the funding of an article (e.g., contract or grant sponsors; contract or grant numbers). + + + + + + + + + + + + + + In metadata, a contract or grant number that supported some or all of the work described. + + + + + + + + + + + In metadata or in a citation, part of a person's structured name. 
+ + + + + + + + + + + + + + + + + + + In metadata or in a citation, the name of a creator or an authoring entity that is not a person (e.g., groups, collaborations, corporations, etc.). + + + + + + + + + + + + + + + + In metadata, part of a person's structured name. + + + + + + + + + + + value + the identifier value + + + + + + In metadata, an identifier associated with the content. + + + + + + + + + + In metadata, a wrapper for id elements. + + + + + + + + + + + + + + + + + + In metadata, a ++++'s job title. + + + + + + + + + + + + + + + + + + + In metadata, an ISBN (International Standard Book Number). + + + + + + + + + + + + + + In metadata, an ISSN (International Standard Serial Number). + + + + + + + + + + + + + + In metadata, a keyword associated with the document's content. + + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, a wrapper for + elements. + + + + + + + + + + + + + + + + + + + the content to which this learning objective applies + + + + + !suppress! + + + + + + + In metadata, a learning objective associated with some or all of a document's content. + + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, a wrapper for + elements. + + + + + + + + + + + + + + + + + + + + + + + + In metadata, a legal statement (disclaimer, etc). + + + + + + + + + + + + + A collection of links to resources associated with a product, unit, or exercise. + + + + + + + + + + + + + wng:MedLinePubType + In metadata, a MedLine publication type. + + + + + + + + + + + wng:MeSHcheckWord + In metadata, a Medical Subject Heading check word. + + + + + + + + + + + wng:MeSHheading + In metadata, a container for Medical Subject Heading descriptor and qualifiers. + + + + + + + + + + + + + set to 'yes' for focus headings that consist of the descriptor only + + + + + + + wng:MeSHdescriptor + In metadata, a Medical Subject Heading descriptor. 
+ + + + + + + + + + set to 'yes' for focus headings that consist of the descriptor plus the qualifier + + + + + + + wng:MeSHqualifier + In metadata, a Medical Subject Heading qualifier. + + + + + + + + + + + wng:MeSHterms + In metadata, a wrapper for MeSH +(Medical Subject Headings) semantics. + + + + + + + + + + + + + + + + + + + + + + + + + + wng:nameAlternative + In metadata, an alternative form of a ++++'s name, such as presentation in Chinese characters. + + + + + + + + + + + + + + + + In metadata or in a citation, a generational suffix on a name. + + + + + + + + + + + + the arabic form of the number, if its printed form differs + + + + + + + In metadata, numbering information, such as a journal volume or page number. + + + + + + + + + + + In metadata, a wrapper for + information. + + + + + + + + + + + + + + + the name of the element whose type is constrained + + + + hint to rendering application whether to display the name (default value is 'yes') + + + + + + URI or IRI for an icon associated with this object name + + + + + presentational hint if object or associated icon should only appear in certain output types (print, online) + + + + + !suppress! + + + + + + + In metadata, a permitted name for certain types of objects. + + + + + + + + + + + + + + + + + + + + + In metadata, a wrapper for + elements. + + + + + + + + + + + + + + + + + + In metadata, an organizational division within an affiliation, such as a university department. + + + + + + + + + + + + + + + + + + + + + In metadata, an organization's name. + + + + + + + + + + + + + + + + In citations, the first page of a cited item. + + + + + + + + + + + In citations, the last page of the cited item. + + + + + + + + + + + wng:personName + In metadata, the name of a person. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + the level of publication being described + + + + sequences/positions the object indicated by the 'level' attribute within the publication + + + + + indicates the access type of the article + + + + + production status + + + + + pagination type (default is 'contiguous') + + contiguous + unit + none + + + + + + + + A container for metadata about the publication, as distinct from its content. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, a wrapper for information about the publisher. + + + + + + + + + + + + + + In metadata or a citation, a publisher's location. + + + + + + + + + + + In metadata or a citation, a publisher's name. + + + + + + + + + + + + + + + + In metadata, a telephone number for an author, editor, or other creator. + + + + + + + + + + + In metadata, a postal code or ZIP code (part of an ++++). + + + + + + + + + + + In metadata, a wrapper for different forms of citation pertaining to the article itself. + + + + + + + + + + + + + In metadata, a line of an ++++. + + + + + + + + + + + + + + + + + + a URI identifying the subject + + + + the content to which this subject applies + + + + + !suppress! + + + + + + + In metadata, subject information. + + + + + + + + + + + + + + + + + + + + + In metadata, a wrapper for subject information. + + + + + + + + + + + + + + + + + + + + + + + + Details of supporting information (supplementary material) for an article or chapter. + + + + + + + + + + + + + + + + + + + + + + Used for EMBO journals. Presence of this attribute indicates that this supportingInfoItem concerns the peer review process. The value 'yes' indicates that the item shows the reviews (without reviewer names) and the value 'no' means it is a statement that the peer reviews are not public. 
+ + + + + + + + An item of supporting information (supplementary material). + + + + + + + + + + + + + + + + + A wrapper for different forms of title. + + + + + + + + + + + + + In metadata, part of a person's structured name. + + + + + + + + + + + wng:unparsedAffiliation + In metadata, a creator's unstructured affiliation. DEPRECATED: ONLY FOR USE IN LEGACY CONTENT. + + + + + + + + + + + + + + + + + + + + + + + + + + + + In metadata, unstructured editorial information. DEPRECATED: ONLY FOR USE IN LEGACY CONTENT. + + + + + + + + + + + + + + + + + + + + + wng:unparsedName + In metadata, an unstructured name. DEPRECATED: ONLY FOR USE IN LEGACY CONTENT. + + + + + + + + + + + + + + + + the number of levels of section headings to use (default is 1) + + + + + + + In ++++s, this expands to a listing of section titles, down to a given level. + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-notes.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-notes.rng new file mode 100644 index 0000000..82b5c5b --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-notes.rng @@ -0,0 +1,126 @@ + + + + + + + + + + + allows overrides of automated additions to label value, such as prepending chapter number (default value is 'formatted'). Mainly for use on equations, chemical structures, and notes. + + + + + + + An element for overriding generated content in numbered or sequentially labeled items. + + + + + + + + + + + + + + + + + + + + For specializing the use of the element. Pre-defined values: 'marginText', 'onlineInline', 'translation'. + + + + + + + + whether this note is numbered (default value is yes, except for type='marginText', where default is no). + + + + + + whether this note has its indicator in a graphic and therefore the indicator should only be generated on the note itself (default value is no). + + + + + + + + A footnote, endnote, or marginal note. + + + + + + + + + + + + + + + + + + + + + + + A wrapper for ++++s. 
+ + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-patterns.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-patterns.rng new file mode 100644 index 0000000..54764fc --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-patterns.rng @@ -0,0 +1,363 @@ + + + + + + + + + + + yes + no + + + + + + + + + + + + + + + + + + + yes + no + unknown + + + + + for specializing the use of the element + + + + + + + print publishing rights (default value is yes). See usage guideline above. + + + + + + online publishing rights (default value is yes). See usage guideline above. + + + + + + copyright for this element content + + + + + + + 1 + I + i + A + a + plain + bulleted + custom + checkbox + + + + + + + for specializing the use of the element + + + + + specifies whether content of the element should be restricted to appearing only in print or only online. Omission of this attribute means the content is to appear in all renderings. + + + + + the intended audience (default value is both) + + + + + A Globally Unique Identifier) + + + + + !suppress! + + + + + !suppress! + + + + + !suppress! + + + + + !suppress! + + + + + !suppress! + + + + + + !suppress! + + + + + + + an xml:id compliant identifier + + + + + the language/dialect of the element content as a two-letter lowercase code conforming to ISO 639-1 (see second column of http://www.loc.gov/standards/iso639-2/php/code_list.php) If the attribute is omitted, the value is inherited from the nearest ancestor bearing an xml:lang value. 
+ + + + + ISO 3166 two-letter country code + + + + + + specifies a number at which to resume automated numbering after an irregularity + + + + + + an ISO 8601 date + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-protocols.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-protocols.rng new file mode 100644 index 0000000..d735a5d --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-protocols.rng @@ -0,0 +1,192 @@ + + + + + + + + + + the type of protocol + + basic + alternate + support + + + + + + + whether this protocol is numbered (only needed if different from 'sectionsNumbered' value on <body>) + + + + + + + + A protocol (laboratory procedure). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + In a laboratory ++++, a description of its materials. + + + + + + + + + + + + + + + + + + + + + + + In a laboratory protocol, a list of ingredients and instructions for producing a reagent or solution. + + + + + + + + + + + + + + + + + + + + + + + A wrapper for a group of ++++s, with an optional title. + + + + + + + + + + + + + + + + + + + + + + + + + + + A numbered step within a laboratory ++++. + + + + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-questions.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-questions.rng new file mode 100644 index 0000000..632de59 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-questions.rng @@ -0,0 +1,749 @@ + + + + + + + + + + + + + + + An answer to a ++++. 
+ + + + + + + + + + + + + + + + + + + + + + + + significant digits + + + + + tolerance + + + + + width of the line, as a percentage of the page width, given as a number 0 - 100 + + + + + + + In a fill-in-the-blanks ++++, a blank to be filled in. + + + + + + + + + + + + + + + + + + + + + + + + + + + whether the choice represents a correct answer + + + + + + + + A choice in a multiple choice, multiple response, true-false, or fill-in-the-blank question. + + + + + + + + + + + + + + + + + + + + style with which to render choices (default value is a) + + hotspot + 1 + I + i + A + a + plain + bulleted + custom + + + + + + whether there may be multiple correct answers among the contained choices (default value is no) + + + + + + + + A wrapper for ++++s. + + + + + + + + + + + + + + + + + + + classification of the question in WileyPLUS + + + + + presence of this attribute indicates that the exercise is not accessible and its value points to an alternative that is accessible + + + + + !suppress! + + + + + points to instructions for this exercise + + + + + !suppress! + + + + + whether this item is to be counted in the numbering scheme (default value is yes) + + + + + + + + + placement hint for answer (not worked solution) and any explanation, in print (default value is here) + + here + margin + chapter + back + instructorManual + + + + + + placement hint for answer (not worked solution) and any explanation, in online presentation (default value is here) + + here + button + + + + + + number of difficulty indicators (default 0) + + 0 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + + + + + + + + An exercise ( +and optionally ++++, ++++, and ++++). + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + A titled part of the question description for an exercise. + + + + + + + + + + + + + + + + + + + + + + + + + + The initial part of the description for an exercise question. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + wng:exerciseInstruction + An instruction applying to a set of exercises. + + + + + + + + + + + + + + + + + + + An exercise with parts. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + whether this item is numbered (default value is yes) + + + + + + string to be prepended to the label/number of each <exercise> within this <exerciseSection> + + + + + + + wng:exerciseSection + A section of questions or other ++++s. + + + + + + + + + + + + + + + + + + + + + + + + + + + + under what condition (user is right or wrong) to display this explanation + + ifRight + ifWrong + + + + + + + + Explanation about an ++++'s correctness. + + + + + + + + + + + placement hint for print (default value is here) + + here + margin + chapter + back + instructorManual + + + + + + placement hint for online presentation (default value is here) + + here + button + + + + + + + + A hint about how to answer a ++++. + + + + + + + + + + + + + + + + + + + + + + + + points to part of a matching pair (or triple) + + + + !suppress! + + + + points to part of a matching pair (or triple) + + + + !suppress! + + + + + points to part of a matching triple + + + + + !suppress! + + + + + + + A match between items giving part of the + in a matching ++++. + + + + + + + + + + + + + + + + Lists of potential matches in a matching exercise. + + + + + + + + + + + + + + + A question. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + placement hint for worked solution in print (default value is here) + + here + margin + chapter + back + instructorManual + studentSolutionsManual + + + + + + placement hint for worked solution in online presentation (default value is here) + + here + button + + + + + + + + A step-by-step solution for an answer in an ++++. 
+ + + + + + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-table.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-table.rng new file mode 100644 index 0000000..0531c7c --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-table.rng @@ -0,0 +1,134 @@ + + + + + + + + + + + + + + + + + + + + + + A table. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Informal tabular material. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g-typo.rng b/dtd/Wileyml3gv20-rng/Wileyml3g-typo.rng new file mode 100644 index 0000000..6060f26 --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g-typo.rng @@ -0,0 +1,268 @@ + + + + + + + + + + + Bold text. + + + + + + + + + + + + + + + + + + + + + + + + Italic text. + + + + + + + + + + + + + + + + + + + + + + Fixed case: text that must always remain in its given case (upper or lower), regardless of context. + + + + + + + + + + + + + + + + + + + + + + Fixed italic: text that must always be presented in italic, regardless of context. + + + + + + + + + + + + + + + + + + + + + Fixed roman: text that must always be presented in roman (upright), regardless of context. + + + + + + + + + + + + + + + + + + + + + Text that is to be typeset or otherwise rendered in small capitals. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + CSS3 styling + + + + + + + wng:span + A span of text identified for special formatting or other purposes. + + + + + + + + + + + + + + + + wng:sub + Subscript text. + + + + + + + + + + + + + + + + + + + + + + Superscript text. 
+ + + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/Wileyml3g.rng b/dtd/Wileyml3gv20-rng/Wileyml3g.rng new file mode 100644 index 0000000..675eeea --- /dev/null +++ b/dtd/Wileyml3gv20-rng/Wileyml3g.rng @@ -0,0 +1,180 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + row + col + + + + + + + + + + + + + + + + + + + 1 + 2 + 3 + 4 + 5 + + + + + + + + + port + land + + + + + + + + + + + + + + + !suppress! + + + + + + + !suppress! + + + + + + + !suppress! + + + + + + + !suppress! + + + + + + + !suppress! + + + + + diff --git a/dtd/Wileyml3gv20-rng/mathml3-common.rng b/dtd/Wileyml3gv20-rng/mathml3-common.rng new file mode 100644 index 0000000..0fab45a --- /dev/null +++ b/dtd/Wileyml3gv20-rng/mathml3-common.rng @@ -0,0 +1,233 @@ + + + + + + start = math + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + block + inline + + + + + + + + + + + + linebreak + scroll + elide + truncate + scale + + + + + + + + + + + + + + + + + + + + + + + top + middle + bottom + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \s*((-?[0-9]*(\.[0-9]*)?(e[mx]|in|cm|mm|p[xtc]|%)?)|(negative)?((very){0,2}thi(n|ck)|medium)mathspace)\s* + + + diff --git a/dtd/Wileyml3gv20-rng/mathml3-presentation.rng b/dtd/Wileyml3gv20-rng/mathml3-presentation.rng new file mode 100644 index 0000000..047524a --- /dev/null +++ b/dtd/Wileyml3gv20-rng/mathml3-presentation.rng @@ -0,0 +1,2265 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \s*([\+\-]?[0-9]*(\.[0-9]*)?\s*((%?\s*(height|depth|width)?)|e[mx]|in|cm|mm|p[xtc]|((negative)?((very){0,2}thi(n|ck)|medium)mathspace)))\s* + + + + + none + solid + dashed + + + + + top + bottom + center + baseline + axis + + + + + left + center + right + 
+ + + + longdiv + actuarial + radical + box + roundedbox + circle + left + right + top + bottom + updiagonalstrike + downdiagonalstrike + verticalstrike + horizontalstrike + madruwb + + + + + + + + + + + + + + + + + \s*\S\s* + + + + + \s*((#[0-9a-fA-F]{3}([0-9a-fA-F]{3})?)|[aA][qQ][uU][aA]|[bB][lL][aA][cC][kK]|[bB][lL][uU][eE]|[fF][uU][cC][hH][sS][iI][aA]|[gG][rR][aA][yY]|[gG][rR][eE][eE][nN]|[lL][iI][mM][eE]|[mM][aA][rR][oO][oO][nN]|[nN][aA][vV][yY]|[oO][lL][iI][vV][eE]|[pP][uU][rR][pP][lL][eE]|[rR][eE][dD]|[sS][iI][lL][vV][eE][rR]|[tT][eE][aA][lL]|[wW][hH][iI][tT][eE]|[yY][eE][lL][lL][oO][wW])\s* + + + + + left + center + right + decimalpoint + + + + + + + + + + + + (\s*\{\s*(left|center|right|decimalpoint)(\s+(left|center|right|decimalpoint))*\})*\s* + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + prefix + infix + postfix + + + + + + + true + false + + + + + + + true + false + + + + + + + + + + + + + + + + + true + false + + + + + + + true + false + + + + + + + + infinity + + + + + + + + + + + + true + false + + + + + + + true + false + + + + + + + true + false + + + + + + + auto + newline + nobreak + goodbreak + badbreak + + + + + + + + + + + + before + after + duplicate + infixlinebreakstyle + + + + + + + + + + left + center + right + auto + id + + + + + + + + + + + + + + + + + left + center + right + auto + id + indentalign + + + + + + + + indentshift + + + + + + + left + center + right + auto + id + indentalign + + + + + + + + indentshift + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + auto + newline + nobreak + goodbreak + badbreak + indentingnewline + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + normal + bold + italic + bold-italic + double-struck + bold-fraktur + script + bold-script + fraktur + sans-serif + bold-sans-serif + sans-serif-italic + 
sans-serif-bold-italic + monospace + initial + tailed + looped + stretched + + + + + + + small + normal + big + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + thin + medium + thick + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + transparent + + + + + + + + + normal + bold + italic + bold-italic + double-struck + bold-fraktur + script + bold-script + fraktur + sans-serif + bold-sans-serif + sans-serif-italic + sans-serif-bold-italic + monospace + initial + tailed + looped + stretched + + + + + + + small + normal + big + + + + + + + + ltr + rtl + + + + + + + + + + + + + normal + bold + + + + + + + normal + italic + + + + + + + + + + + + + + + + + + transparent + + + + + + + + + + + + + + + + + + + + + + + left + right + + + + + + + + + + + + + + + + + left + center + right + decimalpoint + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ltr + rtl + + + + + + + + + + + + + + + + + + + thin + medium + thick + + + + + + + left + center + right + + + + + + + left + center + right + + + + + + + true + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + false + + + + + + + + + + + + + + + + + before + after + duplicate + + + + + + + + + + + + + + true + false + + + + + + + true + false + + + + + + + left + right + center + + + + + + + + + true + false + + + + + + + + + true + false + + + + + + + left + center + right + + + + + + + + loose + medium + tight + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + auto + + fit + + + + + + + + + + + none + updiagonalstrike + downdiagonalstrike + verticalstrike + horizontalstrike + + + + + + + + + left + center + right + + + + + + + + + + + + ltr + rtl + + + + + + + left + right + + + + + + + true + false + + + + + + + true + false + + + + + + + true + false + + + + + + + prefix + infix + postfix + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + left + center + right + auto + id + + + + + + + left + center + right + auto + id + indentalign + + + + + + + left + center + right + auto + id + indentalign + + + + + + + + + + + + + indentshift + + + + + + + + indentshift + + + + + + + + + + + + true + false + + + + + + + + + + + + + + + + + auto + newline + nobreak + goodbreak + badbreak + + + + + + + + + + before + after + duplicate + infixlinebreakstyle + + + + + + + + + + + + + thin + medium + thick + + + + + + + w + nw + n + ne + e + se + s + sw + + + + + + + lefttop + stackedrightright + mediumstackedrightright + shortstackedrightright + righttop + left/\right + left)(right + :right=right + stackedleftleft + stackedleftlinetop + + + + + + + + + + + + + + + small + normal + big + + + + + + + + normal + bold + italic + bold-italic + double-struck + bold-fraktur + script + bold-script + fraktur + sans-serif + bold-sans-serif + sans-serif-italic + sans-serif-bold-italic + monospace + initial + tailed + looped + stretched + + + + + + + + infinity + + + + + + + + + + + + + + + + + true + false + + + + + + + + thin + medium + thick + + + + + + + + + + left + center + right + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + false + + + + + + + + + + + + + + + left + right + leftoverlap + rightoverlap + + + + + + + left + center + right + decimalpoint + + + + + + + true + false + + + + + + + + + + + + + + + + + true + false + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + false + + + + + + + left + right + center + + + + + + + + + + + + 
+ + + + + + true + false + + + + + + + left + right + center + + + + + + + + + + + + + + + + + + + true + false + + + + + + + true + false + + + + + + + left + right + center + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \s*(top|bottom|center|baseline|axis)\s*[0-9]* + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + false + + + + + + + + + + + auto + + fit + + + + + + + + + auto + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + true + false + + + + + + + true + false + + + + + + + true + false + + + + + + + left + right + leftoverlap + rightoverlap + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + top + bottom + center + baseline + axis + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + top + bottom + center + baseline + axis + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \s*(top|bottom|center|baseline|axis)\s*[0-9]* + + + + + + + left + center + right + decimalpoint + + + + + + + left + center + right + + + + + + + + loose + medium + tight + + + + + + + + + + + + + + + + + + + + lefttop + stackedrightright + mediumstackedrightright + shortstackedrightright + righttop + left/\right + left)(right + :right=right + stackedleftleft + stackedleftlinetop + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + w + nw + n + ne + e + se + s + sw + + + + + + + + + none + updiagonalstrike + downdiagonalstrike + verticalstrike + horizontalstrike + + + + + + + + + + + + + + + + + + + + + + + + + + w + nw + n + ne + e + se + s + sw + + + + + + + + + none + updiagonalstrike + downdiagonalstrike + verticalstrike + horizontalstrike + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/Wileyml3gv20-rng/mathml3.rng b/dtd/Wileyml3gv20-rng/mathml3.rng new file mode 100644 index 0000000..98c3fd9 --- /dev/null +++ 
b/dtd/Wileyml3gv20-rng/mathml3.rng @@ -0,0 +1,22 @@ + + + + + Content MathML +include "mathml3-content.rnc" + Presentation MathML + + + math and semantics common to both Content and Presentation + + diff --git a/dtd/Wileyml3gv20-rng/oasisExchangeTable.rng b/dtd/Wileyml3gv20-rng/oasisExchangeTable.rng new file mode 100644 index 0000000..2a154fb --- /dev/null +++ b/dtd/Wileyml3gv20-rng/oasisExchangeTable.rng @@ -0,0 +1,547 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + top + bottom + topbot + all + sides + none + + + + + + + + + + + + Part of Wiley's customized OASIS Exchange Table Model ++++. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Part of Wiley's customized OASIS Exchange Table Model ++++. + + + + + + + + + + + + + + + + + + + + + left + right + center + justify + char + + + + + + + + + + Information about a column in a table, part of Wiley's customized OASIS Exchange Table Model ++++. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + left + right + center + justify + char + + + + + + + + + + + + + + + + Part of Wiley's customized OASIS Exchange Table Model ++++. + + + + + + + + + + + + top + middle + bottom + + + + + + + + + + Part of Wiley's customized OASIS Exchange Table Model ++++. + + + + + + + + + + top + middle + bottom + + + + + + + + + + Part of Wiley's customized OASIS Exchange Table Model ++++. + + + + + + + + + + + + + + + top + middle + bottom + + + + + + + + + + An entry in a table, part of Wiley's customized OASIS Exchange Table Model ++++. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + left + right + center + justify + char + + + + + + + + + + + + + top + middle + bottom + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/XHTMLtablesetup.ent b/dtd/journal-publishing-dtd-2.3/XHTMLtablesetup.ent new file mode 100644 index 0000000..9212f92 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/XHTMLtablesetup.ent @@ -0,0 +1,372 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%xhtml-inlstyle-1.mod; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%xhtml-table-1.mod; + + + diff --git a/dtd/journal-publishing-dtd-2.3/annotation.ent b/dtd/journal-publishing-dtd-2.3/annotation.ent new file mode 100644 index 0000000..2421982 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/annotation.ent @@ -0,0 +1,274 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/articlemeta.ent b/dtd/journal-publishing-dtd-2.3/articlemeta.ent new file mode 100644 index 0000000..e240604 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/articlemeta.entdiff --git a/dtd/journal-publishing-dtd-2.3/backmatter.ent b/dtd/journal-publishing-dtd-2.3/backmatter.ent new file mode 100644 index 0000000..c6e63c0 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/backmatter.ent @@ -0,0 +1,387 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/catalog-v2.xml b/dtd/journal-publishing-dtd-2.3/catalog-v2.xml new file mode 100644 index 0000000..c5a4153 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/catalog-v2.xmldiff --git a/dtd/journal-publishing-dtd-2.3/catalog.ent b/dtd/journal-publishing-dtd-2.3/catalog.ent new file mode 100644 index 0000000..cb9ebf7 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/catalog.ent @@ -0,0 +1,199 @@ +-- ARCHIVING AND INTERCHANGE DTD SUITE -- +-- MODULAR LIBRARY (Version 2.3) -- +-- OASIS SOCAT FILE -- +-- -- +-- With DTDs for Publishing, Archiving, Authoring, -- +-- Book, Book Collection, and Historical DTDs -- +-- -- +-- Change History: -- +-- -- +-- February 2007: (Suite Version 2.3) -- +-- 11. Major and minor changes to all DTDs as a result of the -- +-- October & November 2006 / January 2007 Working Group -- +-- meetings. All modules changed to 2.3. This is the last -- +-- version before any non-backwards compatible changes. -- +-- -- +-- OCTOBER 2006: (Suite Version 2.2) -- +-- 10. Changed the date of the Book DTD, the Book Collection -- +-- DTD, the Historical Book DTD, and book-related modules -- +-- (excepting historical book modules) from -- +-- "v2.1 20050630" to "v2.2 20061013" -- +-- -- +-- APRIL 2006: (Suite Version 2.2) -- +-- 9. Changed the date of all the changed modules from -- +-- "v2.1 20050630" to "v2.2 20060430" -- +-- -- +-- DECEMBER 2005: (Suite Version 2.1) -- +-- 8. CORRECTED TYPOs - changed "ISOlat" to "isolat" to match -- +-- existing filenames; changed July 2005 change numbers -- +-- -- +-- JULY 2005: (Suite Version 2.0) -- +-- 7. ISO Entities - New directory structure/new modules -- +-- 6. 
MathML - New directory structure/new modules -- +-- mathml2-qname-1.mod moved up to top level directory -- +-- mathml2-qname-1.mod moved up to top level directory -- +-- -- +-- Jan 2005: -- +-- 1. Double quotation mark removed after -- +-- "ENTITIES Numeric and Special Graphic for MathML 2.0//EN" -- +-- 2. Filename "journalpubcustomize-models.ent" replaced with -- +-- "journalpubcustom-models.ent" -- +-- 3. Filename "archivecustomize-models.ent" replaced with -- +-- "archivecustom-models.ent" -- +-- 4. fpi "...Journal Publishing DTD v2.1 20050630" replaced -- +-- "...Journal Publishing DTD v2.1 20050630//EN" -- +-- 5. fpi for display.ent given TWO spaces between the name -- +-- and the version number "Class Elements v2.0" -- +-- -- +-- ********************************************************* -- +-- -- +-- JOURNAL PUBLISHING DTD (BLUE) -- +PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd" + +PUBLIC "-//NLM//DTD Journal Publishing DTD-Specific Modules v2.3 20070202//EN" "journalpubcustom-modules.ent" +PUBLIC "-//NLM//DTD Journal Publishing DTD Customize Classes Module v2.3 20070202//EN" "journalpubcustom-classes.ent" +PUBLIC "-//NLM//DTD Journal Publishing DTD Customize Mixes Module v2.3 20070202//EN" "journalpubcustom-mixes.ent" +PUBLIC "-//NLM//DTD Journal Publishing DTD Customize Content and Attributes Module v2.3 20070202//EN" "journalpubcustom-models.ent" + +PUBLIC "-//NLM//DTD NLM Citation v2.3 20070202//EN" "nlmcitation.ent" + +-- ********************************************************* -- + + +-- JOURNAL ARCHIVING AND INTERCHANGE DTD (Green) -- +PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd" + +-- JOURNAL ARCHIVING AND INTERCHANGE DTD CUSTOMIZATIONS -- + +PUBLIC "-//NLM//DTD Journal Archiving DTD-Specific Modules v2.3 20070202//EN" "archivecustom-modules.ent" +PUBLIC "-//NLM//DTD Journal Archiving DTD Customize Classes Module v2.3 20070202//EN" "archivecustom-classes.ent" 
+PUBLIC "-//NLM//DTD Journal Archiving DTD Customize Mixes Module v2.3 20070202//EN" "archivecustom-mixes.ent" +PUBLIC "-//NLM//DTD Journal Archiving DTD Customize Content and Attributes Module v2.3 20070202//EN" "archivecustom-models.ent" + +-- ********************************************************* -- +-- -- +-- JOURNAL AUTHORING DTD (PUMPKIN) -- +PUBLIC "-//NLM//DTD Article Authoring DTD v2.3 20070202//EN" "articleauthoring.dtd" + +-- JOURNAL AUTHORING DTD CUSTOMIZATIONS -- + +PUBLIC "-//NLM//DTD Article Authoring DTD-Specific Modules v2.3 20070202//EN" "articleauthcustom-modules.ent" +PUBLIC "-//NLM//DTD Article Authoring DTD Over-ride Classes Module v2.3 20070202//EN" "articleauthcustom-classes.ent" +PUBLIC "-//NLM//DTD Article Authoring DTD Over-ride Mixes Module v2.3 20070202//EN" "articleauthcustom-mixes.ent" +PUBLIC "-//NLM//DTD Article Authoring DTD Over-ride Content and Attribute Module v2.3 20070202//EN" "articleauthcustom-models.ent" +-- Also uses NLM Citation -- + +-- ********************************************************* -- +-- -- +-- NCBI BOOK DTD AND BOOK COLLECTION DTD (PURPLE) -- +PUBLIC "-//NLM//DTD Book DTD v2.3 20070202//EN" "book.dtd" +PUBLIC "-//NLM//DTD NCBI Book Collection DTD v2.3 20070202//EN" "bookcollection.dtd" + +PUBLIC "-//NLM//DTD Book DTD Module of Modules v2.3 20070202//EN" "bookcustom-modules.ent" +PUBLIC "-//NLM//DTD NCBI Book DTD Over-ride Classes Module v2.3 20070202//EN" "bookcustom-classes.ent" +PUBLIC "-//NLM//DTD NCBI Book DTD Over-ride Mixes Module v2.3 20070202//EN" "bookcustom-mixes.ent" +PUBLIC "-//NLM//DTD NCBI Book DTD Content and Attribute Over-ride Module v2.3 20070202//EN" "bookcustom-models.ent" + +PUBLIC "-//NLM//DTD NCBI Book Image Map Elements v2.3 20070202//EN" "bookimagemap.ent" + +PUBLIC "-//NLM//DTD NCBI Book DTD Book Metadata Elements v2.3 20070202//EN" "bookmeta.ent" +PUBLIC "-//NLM//DTD NCBI Book Multi-link Elements v2.3 20070202//EN" "bookmultilink.ent" +PUBLIC "-//NLM//DTD NCBI Book DTD Book 
Component Metadata Elements v2.3 20070202//EN" "bookpart.ent" +PUBLIC "-//NLM//DTD NCBI Book DTD Book Related Object Element v2.3 20070202//EN" "bookrelated-object.ent" + + +-- NCBI BOOK HISTORICAL BOOK -- +PUBLIC "-//NLM//DTD NCBI Historical Book DTD v2.3 20070202//EN" "historical.dtd" + +PUBLIC "-//NLM//DTD NCBI Historical Book DTD Module of Modules v2.3 20070202//EN" "historical-modules.ent" +PUBLIC "-//NLM//DTD NCBI Historical Book DTD Over-ride Classes Module v2.3 20070202//EN" "historical-classes.ent" +PUBLIC "-//NLM//DTD NCBI Historical Book DTD Over-ride Mixes Module v2.3 20070202//EN" "historical-mixes.ent" +PUBLIC "-//NLM//DTD NCBI Historical Book DTD Content and Attribute Over-ride Module v2.3 20070202//EN" "historical-models.ent" + +PUBLIC "-//NLM//DTD NCBI Historical Book DTD Annotation Elements v2.3 20070202//EN" "annotation.ent" +PUBLIC "-//NLM//DTD NCBI Historical Book DTD Metadata Elements v2.3 20070202//EN" "historicalmeta.ent" +-- Also uses all the book-specific modules -- + +-- ********************************************************* -- + +-- -- +-- ARCHIVING AND INTERCHANGE DTD SUITE -- +-- MODULAR LIBRARY -- +-- DEFINE CLASSES, MIXES, AND MODULES -- +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Module of Modules v2.3 20070202//EN" "modules.ent" +-- -- +-- CLASSES AND MIXES -- +PUBLIC "-//NLM//DTD Default Element Classes Module v2.3 20070202//EN" "default-classes.ent" +PUBLIC "-//NLM//DTD Default Element Mixes Module v2.3 20070202//EN" "default-mixes.ent" +-- -- +-- ELEMENT COMMON (SHARED) ELEMENTS -- +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Common (Shared) Elements Module v2.3 20070202//EN" "common.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Notation Declarations v2.3 20070202//EN" "notat.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite XML Special Characters Module v2.3 20070202//EN" "xmlspecchars.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Custom Special Characters 
Module v2.3 20070202//EN" "chars.ent" +-- -- +-- DEFINE A CLASS MODULES -- +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Journal Article Metadata Elements v2.3 20070202//EN" "articlemeta.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Back Matter Elements v2.3 20070202//EN" "backmatter.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Display Class Elements v2.3 20070202//EN" "display.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Formatting Element Classes v2.3 20070202//EN" "format.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Journal Metadata Elements v2.3 20070202//EN" "journalmeta.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Link Class Elements v2.3 20070202//EN" "link.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite List Class Elements v2.3 20070202//EN" "list.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Math Class Elements v2.3 20070202//EN" "math.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Paragraph-Like Elements v2.3 20070202//EN" "para.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Subject Phrase Class Elements v2.3 20070202//EN" "phrase.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Bibliographic Reference (Citation) Class Elements v2.3 20070202//EN" "references.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite Section Class Elements v2.3 20070202//EN" "section.ent" +-- -- +-- XHTML TABLE ENTITIES -- +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite XHTML Table Setup Module v2.3 20070202//EN" "XHTMLtablesetup.ent" +PUBLIC "-//W3C//ELEMENTS XHTML Tables 1.0//EN" "xhtml-table-1.mod" +PUBLIC "-//W3C//ENTITIES XHTML Inline Style 1.0//EN" "xhtml-inlstyle-1.mod" +-- -- +-- MATHML MODULES -- +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite MathML Setup Module v2.3 20070202//EN" "mathmlsetup.ent" +PUBLIC "-//W3C//DTD MathML 2.0//EN" "mathml2.dtd" +PUBLIC "-//W3C//ENTITIES 
MathML 2.0 Qualified Names 1.0//EN" "mathml2-qname-1.mod" +PUBLIC "-//W3C//ENTITIES Extra for MathML 2.0//EN" "mathml/mmlextra.ent" +PUBLIC "-//W3C//ENTITIES Aliases for MathML 2.0//EN" "mathml/mmlalias.ent" +-- -- +-- ISO SPECIAL CHARACTER ENTITIES (MATHML SET) -- +-- -- +-- ENTITIES FROM SGML NOT USED IN MATHML -- +PUBLIC "-//W3C//ENTITIES Greek Letters//EN" "xmlchars/isogrk1.ent" +PUBLIC "-//W3C//ENTITIES Monotoniko Greek//EN" "xmlchars/isogrk2.ent" +PUBLIC "-//W3C//ENTITIES Alternative Greek Symbols//EN" "xmlchars/isogrk4.ent" +-- -- +-- ENTITIES FROM SGML (ISO 8879)USED IN MATHML -- +-- -- +PUBLIC "-//W3C//ENTITIES Box and Line Drawing for MathML 2.0//EN" "iso8879/isobox.ent" +PUBLIC "-//W3C//ENTITIES Russian Cyrillic for MathML 2.0//EN" "iso8879/isocyr1.ent" +PUBLIC "-//W3C//ENTITIES Non-Russian Cyrillic for MathML 2.0//EN" "iso8879/isocyr2.ent" +PUBLIC "-//W3C//ENTITIES Diacritical Marks for MathML 2.0//EN" "iso8879/isodia.ent" +PUBLIC "-//W3C//ENTITIES Added Latin 1 for MathML 2.0//EN" "iso8879/isolat1.ent" +PUBLIC "-//W3C//ENTITIES Added Latin 2 for MathML 2.0//EN" "iso8879/isolat2.ent" +PUBLIC "-//W3C//ENTITIES Numeric and Special Graphic for MathML 2.0//EN" "iso8879/isonum.ent" +PUBLIC "-//W3C//ENTITIES Publishing for MathML 2.0//EN" "iso8879/isopub.ent" +-- -- +-- ENTITIES FROM ISO 9573-13 USED IN MATHML -- +-- -- +PUBLIC "-//W3C//ENTITIES General Technical for MathML 2.0//EN" "iso9573-13/isotech.ent" +PUBLIC "-//W3C//ENTITIES Greek Symbols for MathML 2.0//EN" "iso9573-13/isogrk3.ent" +PUBLIC "-//W3C//ENTITIES Math Alphabets: Script for MathML 2.0//EN" "iso9573-13/isomscr.ent" +PUBLIC "-//W3C//ENTITIES Added Math Symbols: Arrow Relations for MathML 2.0//EN" "iso9573-13/isoamsa.ent" +PUBLIC "-//W3C//ENTITIES Added Math Symbols: Binary Operators for MathML 2.0//EN" "iso9573-13/isoamsb.ent" +PUBLIC "-//W3C//ENTITIES Added Math Symbols: Delimiters for MathML 2.0//EN" "iso9573-13/isoamsc.ent" +PUBLIC "-//W3C//ENTITIES Added Math Symbols: Negated Relations 
for MathML 2.0//EN" "iso9573-13/isoamsn.ent" +PUBLIC "-//W3C//ENTITIES Added Math Symbols: Ordinary for MathML 2.0//EN" "iso9573-13/isoamso.ent" +PUBLIC "-//W3C//ENTITIES Added Math Symbols: Relations for MathML 2.0//EN" "iso9573-13/isoamsr.ent" +PUBLIC "-//W3C//ENTITIES Math Alphabets: Fraktur for MathML 2.0//EN" "iso9573-13/isomfrk.ent" +PUBLIC "-//W3C//ENTITIES Math Alphabets: Open Face for MathML 2.0//EN" "iso9573-13/isomopf.ent" +-- -- +-- OASIS EXCHANGE TABLE ENTITIES (UNUSED IN VERSION 2.0) -- +PUBLIC "-//OASIS//DTD XML Exchange Table Model 19990315//EN" "oasis-exchange.ent" +PUBLIC "-//NLM//DTD Archiving and Interchange DTD Suite OASIS XML Table Setup Module v1.2 20040830//EN" "oasis-tablesetup.ent" + +-- -- +-- ******** END NLM-NCBI PUBLISHING, ARCHIVING, BOOK, AUTHORING CATALOG **** -- + \ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/chars.ent b/dtd/journal-publishing-dtd-2.3/chars.ent new file mode 100644 index 0000000..86fcf4a --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/chars.ent @@ -0,0 +1,369 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/common.ent b/dtd/journal-publishing-dtd-2.3/common.ent new file mode 100644 index 0000000..299b99e --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/common.entdiff --git a/dtd/journal-publishing-dtd-2.3/default-classes.ent b/dtd/journal-publishing-dtd-2.3/default-classes.ent new file mode 100644 index 0000000..8564b3d --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/default-classes.entdiff --git a/dtd/journal-publishing-dtd-2.3/default-mixes.ent b/dtd/journal-publishing-dtd-2.3/default-mixes.ent new file mode 100644 index 0000000..4646e2c --- /dev/null +++ 
b/dtd/journal-publishing-dtd-2.3/default-mixes.ent @@ -0,0 +1,378 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/display.ent b/dtd/journal-publishing-dtd-2.3/display.ent new file mode 100644 index 0000000..22b773f --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/display.ent @@ -0,0 +1,1637 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/format.ent b/dtd/journal-publishing-dtd-2.3/format.ent new file mode 100644 index 0000000..e562375 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/format.entdiff --git a/dtd/journal-publishing-dtd-2.3/htmltable.dtd b/dtd/journal-publishing-dtd-2.3/htmltable.dtd new file mode 100644 index 0000000..0d0c074 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/htmltable.dtd @@ -0,0 +1,334 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isobox.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isobox.ent new file mode 100644 index 0000000..05e2b13 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isobox.ent @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isocyr1.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isocyr1.ent new file mode 100644 index 0000000..b4149c7 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isocyr1.ent @@ -0,0 +1,88 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isocyr2.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isocyr2.ent new file mode 100644 index 0000000..b038bd9 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isocyr2.ent @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isodia.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isodia.ent new file mode 100644 index 0000000..39ccfcd --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isodia.ent @@ -0,0 +1,35 @@ + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isolat1.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isolat1.ent new file mode 100644 index 0000000..43ae764 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isolat1.ent @@ -0,0 +1,83 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/dtd/journal-publishing-dtd-2.3/iso8879/isolat2.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isolat2.ent new file mode 100644 index 0000000..c29b828 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isolat2.ent @@ -0,0 +1,142 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isonum.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isonum.ent new file mode 100644 index 0000000..79f4380 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isonum.ent @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso8879/isopub.ent b/dtd/journal-publishing-dtd-2.3/iso8879/isopub.ent new file mode 100644 index 0000000..9b27b63 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso8879/isopub.ent @@ -0,0 +1,105 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsa.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsa.ent new file mode 100644 index 0000000..c413168 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsa.ent @@ -0,0 +1,167 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsb.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsb.ent new 
file mode 100644 index 0000000..b74414b --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsb.ent @@ -0,0 +1,143 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsc.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsc.ent new file mode 100644 index 0000000..46ea221 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsc.ent @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsn.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsn.ent new file mode 100644 index 0000000..a1df8b7 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsn.ent @@ -0,0 +1,114 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamso.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamso.ent new file mode 100644 index 0000000..f99cf11 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamso.ent @@ -0,0 +1,73 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsr.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsr.ent new file mode 100644 index 0000000..2251ef1 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isoamsr.ent @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isogrk3.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isogrk3.ent new file mode 100644 index 0000000..0cbde88 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isogrk3.ent @@ -0,0 +1,64 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isomfrk.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isomfrk.ent new file mode 100644 index 0000000..0e1a943 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isomfrk.ent @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isomopf.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isomopf.ent new file mode 100644 index 0000000..4b26425 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isomopf.ent @@ -0,0 +1,49 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isomscr.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isomscr.ent new file mode 100644 index 0000000..a2174f0 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isomscr.ent @@ -0,0 +1,75 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/iso9573-13/isotech.ent b/dtd/journal-publishing-dtd-2.3/iso9573-13/isotech.ent new file mode 100644 index 0000000..d94c775 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/iso9573-13/isotech.ent @@ -0,0 +1,182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/journalmeta.ent b/dtd/journal-publishing-dtd-2.3/journalmeta.ent new file mode 100644 index 0000000..2886bec --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/journalmeta.ent @@ -0,0 +1,333 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/journalpubcustom-classes.ent b/dtd/journal-publishing-dtd-2.3/journalpubcustom-classes.ent new file mode 100644 index 0000000..9aef200 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/journalpubcustom-classes.ent @@ -0,0 +1,239 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/journalpubcustom-mixes.ent b/dtd/journal-publishing-dtd-2.3/journalpubcustom-mixes.ent new file mode 100644 index 0000000..ec99c3c --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/journalpubcustom-mixes.ent @@ -0,0 +1,216 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/journalpubcustom-models.ent b/dtd/journal-publishing-dtd-2.3/journalpubcustom-models.ent new file mode 100644 index 0000000..d927bbd --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/journalpubcustom-models.entdiff --git 
a/dtd/journal-publishing-dtd-2.3/journalpubcustom-modules.ent b/dtd/journal-publishing-dtd-2.3/journalpubcustom-modules.ent new file mode 100644 index 0000000..da36ec5 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/journalpubcustom-modules.ent @@ -0,0 +1,142 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/journalpublishing.dtd b/dtd/journal-publishing-dtd-2.3/journalpublishing.dtd new file mode 100644 index 0000000..74fdc4b --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/journalpublishing.dtd @@ -0,0 +1,1152 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%journalpubcustom-modules.ent; + + + + + +%modules.ent; + + + + + + + + + + + + + +%journalpubcustom-classes.ent; + + + + +%default-classes.ent; + + + + +%journalpubcustom-mixes.ent; + + + + +%default-mixes.ent; + + + + +%journalpubcustom-models.ent; + + + + + + + + + + + +%common.ent; + + + + + + + + +%articlemeta.ent; + + + +%backmatter.ent; + + + +%display.ent; + + + + +%format.ent; + + + +%journalmeta.ent; + + + +%link.ent; + + + +%list.ent; + + + +%math.ent; + + + +%nlmcitation.ent; + + + +%para.ent; + + + +%phrase.ent; + + + +%references.ent; + + + +%section.ent; + + + + + + + + + +%mathmlsetup.ent; + + + + + +%XHTMLtablesetup.ent; + + + + +%xmlspecchars.ent; + + + + +%chars.ent; + + + +%notat.ent; + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/link.ent 
b/dtd/journal-publishing-dtd-2.3/link.ent new file mode 100644 index 0000000..e88eab5 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/link.ent @@ -0,0 +1,545 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/list.ent b/dtd/journal-publishing-dtd-2.3/list.ent new file mode 100644 index 0000000..79b05aa --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/list.ent @@ -0,0 +1,528 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/math.ent b/dtd/journal-publishing-dtd-2.3/math.ent new file mode 100644 index 0000000..15cc331 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/math.ent @@ -0,0 +1,360 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/mathml/mmlalias.ent b/dtd/journal-publishing-dtd-2.3/mathml/mmlalias.ent new file mode 100644 index 0000000..1371af3 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/mathml/mmlalias.entdiff --git 
a/dtd/journal-publishing-dtd-2.3/mathml/mmlextra.ent b/dtd/journal-publishing-dtd-2.3/mathml/mmlextra.ent new file mode 100644 index 0000000..850c7e7 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/mathml/mmlextra.ent @@ -0,0 +1,122 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/mathml2-qname-1.mod b/dtd/journal-publishing-dtd-2.3/mathml2-qname-1.mod new file mode 100644 index 0000000..821ef3d --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/mathml2-qname-1.mod @@ -0,0 +1,286 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + +]]> + + + + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/mathml2.dtd b/dtd/journal-publishing-dtd-2.3/mathml2.dtd new file mode 100644 index 0000000..437ecd8 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/mathml2.dtd @@ -0,0 +1,2206 @@ + + + + + + + + + + +%mathml-qname.modent-isoamsa; + + +%ent-isoamsb; + + +%ent-isoamsc; + + +%ent-isoamsn; + + +%ent-isoamso; + + +%ent-isoamsr; + + +%ent-isogrk3; + + +%ent-isomfrk; + + +%ent-isomopf; + + +%ent-isomscr; + + +%ent-isotech; + + + + +%ent-isobox; + + +%ent-isocyr1; + + +%ent-isocyr2; + + +%ent-isodia; + + +%ent-isolat1; + + +%ent-isolat2; + + +%ent-isonum; + + +%ent-isopub; + + + + +%ent-mmlextra; + + + + +%ent-mmlalias; + +]]> + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/mathmlsetup.ent b/dtd/journal-publishing-dtd-2.3/mathmlsetup.ent new file mode 
100644 index 0000000..96386e8 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/mathmlsetup.ent @@ -0,0 +1,285 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%ent-mmlextra; + + + +%ent-mmlalias; + + + + + + +%mathml.dtd; + + + diff --git a/dtd/journal-publishing-dtd-2.3/modules.ent b/dtd/journal-publishing-dtd-2.3/modules.ent new file mode 100644 index 0000000..a028b4e --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/modules.entdiff --git a/dtd/journal-publishing-dtd-2.3/nlmcitation.ent b/dtd/journal-publishing-dtd-2.3/nlmcitation.ent new file mode 100644 index 0000000..be5424c --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/nlmcitation.ent @@ -0,0 +1,133 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/notat.ent b/dtd/journal-publishing-dtd-2.3/notat.ent new file mode 100644 index 0000000..84f1088 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/notat.ent @@ -0,0 +1,181 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/oasis-exchange.ent b/dtd/journal-publishing-dtd-2.3/oasis-exchange.ent new file mode 100644 index 0000000..73b0432 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/oasis-exchange.ent @@ -0,0 +1,354 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/oasis-tablesetup.ent b/dtd/journal-publishing-dtd-2.3/oasis-tablesetup.ent new file mode 100644 index 0000000..5e52c5a --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/oasis-tablesetup.ent @@ -0,0 +1,308 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%oasis-exchange.ent; + + + diff --git a/dtd/journal-publishing-dtd-2.3/para.ent b/dtd/journal-publishing-dtd-2.3/para.ent new file mode 100644 index 0000000..9e6d5d8 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/para.ent @@ -0,0 +1,574 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/phrase.ent b/dtd/journal-publishing-dtd-2.3/phrase.ent new file mode 100644 index 0000000..5eb33df --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/phrase.ent @@ -0,0 +1,377 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/dtd/journal-publishing-dtd-2.3/references.ent b/dtd/journal-publishing-dtd-2.3/references.ent new file mode 100644 index 0000000..86179ba --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/references.entdiff --git a/dtd/journal-publishing-dtd-2.3/samplesmall-pub.xml b/dtd/journal-publishing-dtd-2.3/samplesmall-pub.xml new file mode 100644 index 0000000..ec904b9 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/samplesmall-pub.xml @@ -0,0 +1 @@ +
Lapdeb-1 03-647-1 Mulberry Technologies, Inc. ArchivalTest1 Archival DTD Test Article (Just a small one for parsing) Jones John Browning The HonorableIII 2004 Sample-test-only-id 2004

This is a list inside a paragraph:

Poodles

Persian Cats

Weaver Finches

Geckos

The following is an XHTML table, inside a Table Wrapper &lt;table-wrap> wrapper.

XHTML Table

Additional caption material

A cell! Another Still a third>
2 A cell! An xref: See the statement 2 Still a third>
There be statements here.

The IDREFs need to point to something

No Matter How Wise You Get, Wet Birds Don't Fly at Night

Post Hoc Ergo Propter Hoc

We thank you all.

Bibliography

Just a reference or two for testing:

A citation ain't nothing but a sandwich — personal communication Piggy Ms. Can't Help Lovin’ That Frog of Mine; Swine Review, 145: 1224; 2003. Ant-cay Elp-hay Ovin’-lay At-thay Og-fray of-ay Ine-may
\ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/section.ent b/dtd/journal-publishing-dtd-2.3/section.ent new file mode 100644 index 0000000..fbf1793 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/section.ent @@ -0,0 +1,296 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/xhtml-inlstyle-1.mod b/dtd/journal-publishing-dtd-2.3/xhtml-inlstyle-1.mod new file mode 100644 index 0000000..e5016b9 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/xhtml-inlstyle-1.mod @@ -0,0 +1,34 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/xhtml-table-1.mod b/dtd/journal-publishing-dtd-2.3/xhtml-table-1.mod new file mode 100644 index 0000000..0cf9bcd --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/xhtml-table-1.mod @@ -0,0 +1,333 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + + + + + + + +]]> + + + +]]> + + + + + + +]]> + + + +]]> + + \ No newline at end of file diff --git a/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk1.ent b/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk1.ent new file mode 100644 index 0000000..1ed96fa --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk1.ent @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk2.ent b/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk2.ent new file mode 100644 index 
0000000..d8212b4 --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk2.ent @@ -0,0 +1,41 @@ + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk4.ent b/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk4.ent new file mode 100644 index 0000000..097f90e --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/xmlchars/isogrk4.ent @@ -0,0 +1,66 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dtd/journal-publishing-dtd-2.3/xmlspecchars.ent b/dtd/journal-publishing-dtd-2.3/xmlspecchars.ent new file mode 100644 index 0000000..229eaec --- /dev/null +++ b/dtd/journal-publishing-dtd-2.3/xmlspecchars.ent @@ -0,0 +1,342 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +%ISOlat1; +%ISOlat2; +%ISObox; +%ISOdia; +%ISOnum; +%ISOpub; +%ISOcyr1; +%ISOcyr2; + +%ISOgrk1; +%ISOgrk2; +%ISOgrk4; + +%ISOtech; +%ISOgrk3; +%ISOamsa; +%ISOamsb; +%ISOamsc; +%ISOamsn; +%ISOamso; +%ISOamsr; +%ISOmscr; +%ISOmfrk; +%ISOmopf; + + + + + + diff --git a/listing.bash b/listing.bash new file mode 100644 index 0000000..8bcdd81 --- /dev/null +++ b/listing.bash @@ -0,0 +1,19 @@ +#!/bin/bash + +# XMLWF USAGE +xmlwf xml/oup-sample.xml +echo $? +xmlwf xml/bad-formed.xml +echo $? +xmlwf xml/* +echo $? 
+ +# get-doctype USAGE +tools/parse_doctype xml/oup-sample.xml +tools/parse_doctype xml/wiley-sample.xml +tools/parse_doctype xml/parsing-problem.xml +cat tools/node_modules/get-doctype/test/dataset/big.xml | tools/parse_doctype + +# xmlstarlet c14n / val / tr / sel +xmlstarlet c14n xml/wiley-sample.xml > tmp/wiley.xml +xmlstarlet val -e -d dtd/Wileyml3gv20-flat/Wileyml3gv20-flat.dtd tmp/wiley.xml \ No newline at end of file diff --git a/tmp/wiley.xml b/tmp/wiley.xml new file mode 100644 index 0000000..1419fca --- /dev/null +++ b/tmp/wiley.xml @@ -0,0 +1,1775 @@ + +
+ + + Wiley Subscription Services, Inc., A Wiley Company + Hoboken + + 10.1002/(ISSN)1532-2890 + 1532-2882 + 1532-2890 + + + + + Journal of the American Society for Information Science and Technology + J. Am. Soc. Inf. Sci. + + + + Journal of the American Society for Information Science + 0002-8231 + 1097-4571 + 2000 + 51 + 14 + + + + + 10.1002/asi.v62.9 + + 62 + 9 + + September 2011 + + + 10.1002/asi.21591 + + + + + + + + Research Article + Research Articles + + © 2011 ASIS&T + + + + + + + + + + + + + 1696 + 1716 + + + data formats + overlap + markup languages + semantic web + data conversion + + + + + + + + + + + + + A Semantic Web approach to everyday overlapping markup + + + + + Angelo + Di Iorio + + + diiorio@cs.unibo.it + + + + + Silvio + Peroni + + + speroni@cs.unibo.it + + + + + Fabio + Vitali + + + fabio@cs.unibo.it + + + + + + Department of Computer Science, University of Bologna, Bologna, Italy + + + + + Abstract +

Overlapping structures in XML are not symptoms of a misunderstanding of the intrinsic characteristics of a text document nor evidence of extreme scholarly requirements far beyond those needed by the most common XML‐based applications. On the contrary, overlaps have started to appear in a large number of incredibly popular applications hidden under the guise of syntactical tricks to the basic hierarchy of the XML data format. Unfortunately, syntactical tricks have the drawback that the affected structures require complicated workarounds to support even the simplest query or usage. In this article, we present Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), an approach to overlapping markup that simplifies and streamlines the management of multiple hierarchies on the same content, and provides an approach to sophisticated queries and usages over such structures without the need of ad‐hoc applications, simply by using Semantic Web tools and languages. We compare how relevant tasks (e.g., the identification of the contribution of an author in a word processor document) are of some substantial complexity when using the original data format and become more or less trivial when using EARMARK. We finally evaluate positively the memory and disk requirements of EARMARK documents in comparison to Open Office and Microsoft Word XML‐based formats.

+
+
+
+
+ + +
+ Introduction +

The overwhelming consensus among XML practitioners is that documents are trees, the hierarchy is the fundamental data structure, and violations of the hierarchy are errors or unnecessary complications. Therefore, overlapping markup has received ambivalent, almost schizoid considerations in the field of markup languages. Traditionally, overlaps were the hallmarks of bad HTML coders and naïve HTML page editors, taking advantage of the unjustified benevolence in web browsers that would display basically any HTML regardless of proper nesting. At the same time, far from the awareness of the general public, overlaps have been a fringe, almost esoteric, discipline of scholars in the humanities, competently used for arcane specifications of linguistic annotations and literary analysis.

+

Although the first type of overlap was judged with scorn and the second with awe, they both fundamentally represent a situation that is more common than was thought, and the scholars were only more aware, and not more justified, about the need to represent overlaps.

+

Generally, overlap is needed when multiple independent items refer to the same segment, either when considering textual markup documents or multimedia structures (Salembier & Benitez, + 2007). Regarding documents with markup, we need overlap whenever multiple markup elements need to be applied over the same content, and these elements happen to be independent of each other. In some (rather frequent) situations, this independence means that the content referred to by some elements is partially, but not completely, the same as the content referred to by other elements.

+

This situation is more frequent than it may appear: Not only do bad HTML code and arcane linguistic annotations use overlap, but many more mainstream and mundane examples exist. For instance, change tracking in an office document is often at odds with the underlying structure of the text, microformats (Allsopp, + 2007) and Resource Description Framework–in–attributes (RDFa; Adida, Birbeck, McCarron, & Pemberton, + 2008) annotations may need to refer to concepts that span across multiple XML elements, complex data structures (e.g., biological data) force graphs into trees and hide multiple parentage as internal references, and so on.

+

Differently from SGML, which is able to handle some overlapping scenarios through the CONCUR notation (Goldfarb, + 1990), XML grammatically imposes and requires a strict hierarchy of containment generating a single mathematical tree of the document where no overlap is allowed. This requirement has been turned into an intrinsic characteristic of the documents XML was meant to represent rather than a syntactical and conceptual constraint into which these documents need to fit. Thus, whenever authors needed to cope with independent markup elements, they managed either by naïvely ignoring the hierarchical limitation (and therefore creating invalid documents) or by creating careful workarounds within the syntactical constraint, or even by inventing completely new markup languages that allow some types of overlap. But while new multihierarchical markup languages such as TexMecs (Huitfeldt & Sperberg‐McQueen, + 2003) and LMNL (Tennison & Piez, + 2002) have a small number of adepts and applications, and while bad HTML coders and bad HTML page editors are disappearing from the market, the careful workarounds within the XML syntax (TEI Consortium, + 2005), such as segmentation, milestones, or standoff markup, are to this day frequently used and ubiquitous.

+

All workarounds share the same approach of hiding structural information about a secondary hierarchy under the guise of something else: split individual elements, empty boundary elements, indirect references, and so on. The result is that the secondary structural information is hidden or its importance is lessened to not break or obfuscate the main hierarchy expressed in the visible XML structure. But this comes at a price: Structures specified through workarounds are more difficult to find, identify, and act upon than are the structures in the main XML hierarchy. Thus, trivial searches that should amount to a short XPath in a more direct situation end up being multiple‐lines long, pretty basic visualizations require incredibly complex XSLT stylesheets, specific choices of the main markup hierarchy actually prevent some features of the secondary markup to even exist, and so on. So, although workarounds exist and can be used, hierarchies expressed through them are “second‐class citizens” that cannot fully exploit the sophisticated tools that the XML language provides.

+

In this article, we show how Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), our proposal for managing overlapping markup, does not generate first‐ and second‐class hierarchies, and allows existing, sophisticated tools to be used on all markup—even in the presence of overlaps. Rather than creating a completely new language requiring completely new tools and competencies, EARMARK uses Semantic Web technologies and Semantic Web tools to obtain many of the results obtainable with usual XML tools.

+

EARMARK defines markup vocabularies by means of OWL ontologies (W3C OWL Working Group, + 2009). Since each individual markup item is an independent assertion over some content or some other assertions, overlaps of content are not a problem, nor are all the issues connected to physical embedding and containments, such as contiguity and document order. Furthermore, by using standard Semantic Web technologies, fairly sophisticated functionalities can be provided over EARMARK documents.

+

Through EARMARK, operations that were previously very hard or impossible exactly because of the interferences of the multiple hierarchies or of the workarounds they employed now become fundamentally trivial since no syntactical tricks are employed and the different hierarchies do not interfere with each other. Thus, for instance, identifying the individual contributions in a multi‐authored MS Word or Open Office document is quite hard on their original XML formats, and becomes trivial when the same documents are converted into EARMARK.

+

This article is an extended version of previous works on EARMARK (Di Iorio, Peroni, & Vitali, + 2010; Peroni & Vitali, + 2009). In those works, we focused on identifying workarounds for overlapping data existing in real XML documents and translating them into EARMARK assertions. We also sketched the EARMARK ontology and presented a simple implementation of EARMARK‐aware tools. This article follows and extends them and also provides some novel contributions: + + +

The systematic analysis of the EARMARK model, with particular attention to data typing and overlapping structures

+ + +

The discussion of further applications for the ontological EARMARK approach. In particular, we show how EARMARK can be used to improve the content filtering and reversions mechanisms of wikis.

+
+ +

The brief description of a process, called ROCCO, for generating EARMARK documents from existing XML documents (even ones that use workarounds for overlapping structures)

+
+ +

An evaluation of EARMARK efficiency when dealing with multiple hierarchies in comparison with the XML structures used by popular XML‐based formats such as Open Office and MS Word.

+
+ +

+

The article is structured as follows: First, we provide a brief overview of existing approaches to overlap using workarounds in XML or ad hoc markup metalanguages, and then give a few examples of situations where overlaps are used today and sometimes in rather mainstream situations. Next, we present the EARMARK model and its rules. Then, we provide some use cases that are meant to demonstrate the superiority of the EARMARK approach to a traditional XML format, especially when overlaps come into question, and show the generation of EARMARK documents, converting legacy documents. An initial evaluation of the efficiency of EARMARK compared to popular XML based data formats such as Open Document (ODT) and Office Open XML (OOXML) is presented, followed by our conclusions.

+
+
+ Existing Approaches to Overlapping +

The need for multiple overlapping structures over documents using markup syntaxes such as XML and SGML is an age‐old issue, and a large amount of literature exists on the techniques, languages, and tools that allow users to create multiple entangled hierarchies over the same content. A good review can be found in DeRose ( + 2004).

+

Some research has proposed using plain hierarchical markup (i.e., XML) and employing specially tailored elements or attributes to express the semantics of overlapping in an implicit way. The TEI Guidelines (TEI Consortium, + 2005) presented a number of different techniques that use SGML/XML constructs to force multiple hierarchies into a single one, including: + + +

milestones (i.e., overlapping structures are expressed through empty elements to mark the boundaries of the “content”),

+ + +

fragmentation (i.e., overlapping structures are split into individual, nonoverlapping elements that may even be linked through id–idref pairs), and

+
+ +

standoff markup (i.e., overlapping structures are placed elsewhere and indirectly refer to their would‐be locations through pointers, locators, and/or id–idref pairs).

+
+ +

+

Given the large number of techniques to deal with overlapping structures in XML, in Marinelli, Vitali, and Zacchiroli ( + 2008), we presented a number of algorithms to convert XML documents with overlapping structures from and to the most common approaches, as well as a prototype implementation.

+

Riggs ( + 2002) introduced a slightly different technique for fragmentation within XML structures. In this proposal, floating elements (i.e., those elements that do not fall in a proper or meaningful hierarchical order) are created using the name of the element followed by an index referring to its semantically related parent element. For example, the floating element <name.person[2]>John</name.person[2]> means that <name>John</name> is semantically a child of the second occurrence of the element person, even though the floating element is not structurally contained by its logical parent.

+

Other research even has proposed to get rid of the theory of trees at the base of XML/SGML altogether and use different underlying models and newly invented XML‐like languages that allow the expression of overlaps through some kind of syntactical flourishing. For instance, a general ordered‐descendant directed acyclic graph (GODDAG; Sperberg‐McQueen & Huitfeldt, + 2004) is a family of graph‐theoretical data structures to handle overlapping markup. A GODDAG's nodes represent markup elements and text. Arcs are used to explicitly represent containment and father–child relations. Since multiple arcs can be directed to the same node, overlapping structures can be straightforwardly represented in GODDAG. Full GODDAGs cannot be linearized in any form using embedded markup, but restricted GODDAGs, a subset thereof, can be and have been linearized into TexMecs (Huitfeldt & Sperberg‐McQueen, + 2003), a multihierarchical markup language that also allows full GODDAGs through appropriate nonembedding workarounds such as standoff markup.

+

LMNL (Tennison & Piez, + 2002) is a general data model based on the idea of layered text fragments and ranges, where multiple types of overlap can be modeled using concepts drawn from the mathematical theory of intervals. Multiple serializations of LMNL exist, such as CLIX and LMNL‐syntax.

+

XConcur (Schonefeld & Witt, + 2006) is a similar solution based on the representation of multiple hierarchies within the same document through layers. Strictly related to its predecessor CONCUR as it was included in SGML, XConcur was developed in conjunction with the validation language XConcur‐CL to handle relationships and constraints between multiple hierarchies.

+

The variant graph approach (Schmidt & Colomb, + 2009) also is based on graph theory. Developed to deal with textual variations that generate multiple versions of the same document with multiple overlapping hierarchies, this theory proposes a new data model to represent literary documents and a graph linearization (based on lists) that scales well even with a large number of versions. The same authors recently presented an extension of their theory that also allows users to merge multiple variants into one document (Schmidt, + 2009). Portier and Calabretto ( + 2009) presented a detailed survey of overlapping approaches and also discussed the MultiX2 data model, which uses W3C standard languages such as XInclude to link and fetch text fragments within overlapping structures, and a prototype editor for the creation of multistructured documents.

+

Tummarello, Morbidoni, and Pierazzo ( + 2005) proposed using RDF as a standoff notation for overlapping structures of XML documents. Since this proposal has many affinities with the one we are presenting in this article, we later discuss its characteristics and compare it with ours.

+
+
+ More Frequent Than One May Think: Overlapping in the Wild +

Overlapping structures have been considered often as appropriate only in highly specific contexts and basically for scholars: The solutions that have been proposed in the literature were complex since they were considered grounded in the intrinsic complexity of the topics themselves. Yet, overlapping structures can be found in many more fields than these, and even mainstream applications generate and use markup with overlapping structures. While the complexity of overlapping is hidden to the final user, applications that consume such data may very well find it rather difficult to handle such information. We next discuss three very different contexts where overlapping already exists and fairly relevant information is encoded in multiple independent structures, leaving to special code the task of managing the complexity.

+
+ Change Tracking in Office Document Formats +

Word processors such as Microsoft Word and Open Office provide users with powerful tools for tracking changes, allowing each individual modification by individual authors to be identified, highlighted, and acted upon (e.g., by accepting or discarding them). The intuitiveness of the relevant interfaces actually hides the complexity of the data format and of the algorithms necessary to handle such information.

+

For instance, the standard ODT format (JTC1/SC34 WG 6, + 2006) used by Open Office, when saving change‐tracking information, relies on two specific constructs for insertions and deletions that may overlap with the structural markup. Adding a few words within a paragraph is not in itself complex, as it does not require the breaking of the fundamental structural hierarchy; conversely, changes that affect the structure itself (e.g., the split of one paragraph in two by the insertion of a return character, or the joining of two paragraphs by the elimination of the intermediate return character) require that annotations are associated to the end of a paragraph and the beginning of the next, in an unavoidably overlapping pattern. ODT uses milestones and standoff markup for insertions and deletions, respectively, and also relies on standoff markup for annotations about the authorship and date of the change.

+

For instance, the insertion of a return character and a few characters in a paragraph creates a structure as follows:

+

+ + + + + + +

+

The empty elements <text:change‐start/> and <text:change‐end/> are milestones marking the beginning and the end, respectively, of the range that constituted the insertion while the element <text:insertion>, before the beginning of the document content, is standoff markup for the metadata about the change (author and date information).

+

Similarly, a deletion creates a structure as follows:

+

+ + + + + + +

+

The element <text:change/> represents a milestone of the location where the deletion took place in the content, and the corresponding standoff markup annotation <text:deletion> contains not only the metadata about the change but also the text that was deleted.

+

The OOXML format (JTC1/SC34 WG 4, + 2008) (the XML‐based format used by Microsoft Office 2007 and standardized by ISO in 2008), on the other hand, uses a form of segmentation to store change‐tracking information across all previous elements involved.

+

+ + + + + + +

+

This heavily simplified version of an OOXML document shows two separate changes: (a) the insertion of a return character and (b) the insertion of a word. These modifications are not considered as a single change; therefore, the segments are not connected to each other but simply created as needed to fit the underlying structure.

+

In fact, change tracking in OOXML is a fairly complex proposition. Although providing more complete coverage of special cases and situations than does ODT, dealing with its intricacies is not for the casual programmer. Even a simple XSLT stylesheet to show inserted text in a different color and hide deleted text may run several hundred lines of code. + 1 + +

+ http://OOXMLdeveloper.org/archive/2006/09/07/625.aspx +

+ +

+
+
+ Overlapping With Microformats +

Microformats (Allsopp, + 2007) add semantic markup to web documents by using common structures of the HTML language itself—in particular, the class attribute.

+

The HTML code is annotated using microformats to provide new semantic, machine‐processable assertions. In the following example, a plain HTML table is enriched with metadata about events + 2 + +

HCalendar, + http://microformats.org/wiki/hcalendar +

+ + and people: + 3 + +

HCard, + http://microformats.org/wiki/hcard +

+
+

+

+ + + + + + +

+

The table was enriched by additional data declaring it to be an event (a conference), and data about the event itself (URL, summary, location) and about four relevant individuals (with their names and roles within the conference) were associated where necessary to the actual content of the table.

+

So far, so good, and no overlap to speak about. Things change dramatically, though, when the overall structure of the main hierarchy (the HTML table) is at odds with the intrinsic hierarchy of the microformat data, such as if the people are organized in columns rather than rows. For instance:

+

+ + + + + + +

+

Unfortunately, vcards are a hierarchy themselves, and if the hierarchy of vcards is organized differently from the hierarchy of the HTML table, as in the latter case, it is just impossible to define the four vcards for the four people organizing the conference. Thus, in plain HTML, the choice of one of two possible presentation models for the main hierarchy of content makes trivial or completely impossible the existence of the second hierarchy.

+

A possible and partial solution to express vcard hierarchies in the latter example is RDFa (Adida et al., + 2008), a W3C recommendation. It describes a mechanism to embed RDF statements into HTML documents by using some HTML attributes (href, rel, rev, content) in combination with other ad hoc attributes (property, about, typeof) proposed in the recommendation itself.

+

+ + + + + + +

+

Since all attributes live in the context of elements, the price to pay is that to assert everything we want to assert, we often need to add some structurally unnecessary elements to the current markup hierarchy of a document, needed only to add the RDF statements (e.g., the span elements emphasized earlier). Even if that does not represent a significant problem for strict Semantic Web theorists, document architects and markup experts see this as a kludge and an inelegant compromise.

+
+
+
+ Wikis: No Overlapping Where Some Should Be +

The strength of wikis lies in their allowing users to modify content at any time. The mechanisms of change‐tracking and rollback that are characteristics of all wikis, in fact, promote users' contributions and make “malicious attacks” pointless in the long run since previous versions can be easily restored.

+

A number of tools exist that automatically discover “wiki vandalisms” and provide users with powerful interfaces to surf changes, identify differences between subsequent versions, and revert content. For instance, Huggle + 4 + +

+ http://en.wikipedia.org/wiki/Wikipedia:Huggle +

+ + is an application dealing with vandalism in Wikipedia, based on a proxy architecture and .NET technologies. A straightforward interface allows users to access any version of a page, highlights contributions of a specific user, and reverts the content to old versions.

+

Even client‐side tools—meant to be installed as browser extensions or bookmarklets—exist to extend the rollback mechanisms of Wikipedia, giving users more flexibility and control over (vandalistic) changes. For instance, Lupin + 5 + +

+ http://en.wikipedia.org/wiki/User:Lupin/Anti‐vandal_tool +

+ + is a set of javascript scripts that check a wiki page against a list of forbidden terms so that authors can identify undesirable modifications and restore previous (i.e., good) versions without a continuous control over the full content of the page; yet again, Twinkle + 6 + +

+ http://en.wikipedia.org/wiki/Wikipedia:Twinkle +

+
+ provides users with powerful rollback functions and includes a full library of batch deletion functions, automatic reporting of vandals, and user notification functions.

+

These tools are successful in highlighting vandalism and in identifying versions created by malicious users. However, although it is possible to revert the page to any previous version, all changes (even acceptable ones) that were subsequent to the malicious version cannot be automatically inherited by the restored page.

+

For instance, consider Versions V1, V2, and V3 of a wiki page, where Version V1 contains a baseline (i.e., acceptable) content, and Version V2 is identified as a partial vandalism and is agreed to be removed, but Version V3 contains (possibly, in a completely different section than the target of the malicious attack) relevant and useful content that was added before the vandalistic Version V2 was declared as such. The task of removing the modifications of Version V2 while maintaining (whatever is possible of) Version V3 is a difficult, error‐prone, and time‐consuming task if done manually, yet there is no tool we are aware of that automatically filters contributions from multiple versions and merges them into a new one (or, equivalently, removes only selected intermediate versions).

+

However, it is possible to theoretically characterize the interdependencies between subsequent changes to a document. In fact, literature has existed for a long time on exactly these themes (e.g., Durand, + 1994, + 2008). Although a detailed discussion of abstract models of interconnected changes is out of scope for this article (details and authoritative references can be found in the aforementioned works), what is relevant in this discussion is that they happen to assume a hierarchical form that is frequently at odds with the hierarchical structure of the content of the document, and as such, most issues derive from the data structures in which content is stored and from the model for manipulating these structures. For instance, the fact that in the wiki perspective each version is an independent unit that shares no content (even unchanged content) with the other versions prevents considering multiple versions as overlapping structures coexisting on the same document. If we were able to make these hierarchies explicit, we would be able to create models and tools to manipulate these documents in a more powerful way and to exploit the existing interconnections between the overlapping hierarchies.

+
+
+ Introduction to EARMARK and Its Support for Overlapping Features +

The presence of hidden overlapping structures—transparent to users, but very difficult to handle by applications—is the common denominator for the scenarios described in the previous section. More than the overlap itself, which cannot be ignored because it does exist and carries important meanings, the problem we face lies in the way applications store such overlapping structures. In the XML world, in fact, the only way to do so is through the use of (complex) workarounds that force the multiple hierarchies into one hierarchy of an XML document. That makes it very tricky to perform sophisticated analysis and searches.

+

This section discusses a different approach to metamarkup, EARMARK (Di Iorio, Peroni, & Vitali, + 2009; Di Iorio et al., + 2010; Peroni & Vitali, + 2009) based on ontologies and Semantic Web technologies. The basic idea is to model EARMARK documents as collections of addressable text fragments, and to associate such text content with OWL assertions that describe structural features as well as semantic properties of (parts of) that content. As a result, EARMARK allows not only documents with single hierarchies (as with XML) but also multiple overlapping hierarchies where the textual content within the markup items belongs to some hierarchies, but not to others. Moreover, EARMARK makes it possible to add semantic annotations to the content through assertions that may overlap with existing ones.

+

One of the advantages of using EARMARK is the capability to access and query documents by using well‐known and widely supported tools for the Semantic Web. In fact, EARMARK assertions are simply RDF assertions while EARMARK documents are modeled through OWL ontologies. The consequence is that query languages (e.g., SPARQL; Garlik & Seaborne, + 2010) and actual existing tools such as Jena + 7 + +

+ http://jena.sourceforge.net +

+ + and Pellet + 8 + +

+ http://pellet.owldl.com +

+
+ can be directly used to deal with even incredibly complicated overlapping structures. What is very difficult (or impossible) to do with traditional XML technologies becomes much easier with these technologies under the EARMARK approach.

+

In the rest of this section, we give a brief overview of the EARMARK model and then describe how EARMARK can be used to deal with the issues presented earlier. The model itself is defined through an OWL document, + 9 + +

+ http://www.essepuntato.it/2008/12/earmark +

+ + summarized in Figure + 1, specifying classes and relationships. We distinguish between ghost classes, which define the general model, and shell classes, which are actually used to create EARMARK instances.

+
+ + + + + + + + +

A UML‐like representation of the EARMARK ontology. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+
+ Ghost Classes +

The ghost classes describe three disjoint base concepts—docuverses, ranges, and markup items—through three different and disjoint OWL classes. + 10 + +

All our OWL samples are presented using the Manchester Syntax (Horridge & Patel‐Schneider, + 2009), which is one of the standard linearization syntaxes of OWL. The prefixes rdfs and xsd refer to RDF Schema and XML Schema namespaces, respectively, while the empty prefix refers to the EARMARK ontology URI plus “#.” Moreover, we use the prefix c to indicate entities taken from an imported ontology made for the SWAN project (Ciccarese et al., + 2008); available at + http://swan.mindinformatics.org/spec/1.2/collections.html +

+ +

+

The textual content of an EARMARK document is conceptually separated from its annotations, and is referred to through the Docuverse class. + 11 + +

This class (and its name) is based on the concept introduced by Ted Nelson ( + 1980) in his Xanadu Project to refer to the collection of text fragments that can be interconnected to each other and transcluded into new documents.

+ + The individuals of this class represent the object of discourse (i.e., all the containers of text of an EARMARK document).

+

+ + + + + + +

+

Any individual of the Docuverse class—commonly called a docuverse (lowercase to distinguish it from the class)—specifies its actual content with the property hasContent.

+

We then define the class Range for any text lying between two locations of a docuverse. A range (i.e., an individual of the class Range) is defined by a starting and an ending location (any literal) of a specific docuverse through the properties begins, ends, and refersTo, respectively.

+

+ + + + + + +

+

There is no restriction on locations used for the begins and ends properties. That is very useful because it allows us to define ranges that follow or reverse the text order of the docuverse to which they refer. For instance, the string “desserts” can be considered both in document order, with the begins location lower than the ends location, or in the opposite order, forming “stressed.” + 12 + +

+ http://en.wikipedia.org/wiki/Palindrome#Semordnilaps +

+ + Thus, the values of the properties begins and ends define the way a range must be read.

+

The class MarkupItem is the superclass defining artifacts to be interpreted as markup (e.g., elements and attributes).

+

+ + + + + + +

+

A markupitem individual is a collection (c:Set, c:Bag, or c:List, where the latter is a subclass of the second one, and all of them are subclasses of c:Collection) of individuals belonging to the classes MarkupItem and Range. Through these collections, it is possible to define a markup item as a set, a bag, or a list of other markup items, using the properties element (for sets) and item and itemContent (for bags and lists). Thus, it becomes possible to define elements containing nested elements or text, or attributes containing values, as well as overlapping and complex structures. Note also that handling collections directly in OWL allows us to reason about content models for markup items, which would not be possible if we had used the corresponding constructs in RDF. + 13 + +

+ http://hcklab.blogspot.com/2008/12/moving‐towards‐swan‐collections.html +

+ +

+

A markupitem also might have a name, specified in the functional property hasGeneralIdentifier (recalling the SGML term to refer to the name of elements; Goldfarb, + 1990), and a namespace, specified using the functional property hasNamespace. Note that we can have anonymous markup items—as is possible in LMNL (Tennison & Piez, + 2002) and in GODDAG (Sperberg‐McQueen & Huitfeldt, + 2004)—by simply asserting that the item belongs to the class of all those markupitems that do not have a general identifier (i.e., hasGeneralIdentifier exactly 0).

+
+
+ Shell Classes +

The ghost classes discussed so far give us an abstract picture of the EARMARK framework. We need to specialize our model, defining a concrete description of our classes. These new shell subclasses apply specific restrictions to the ghost classes.

+

First, the class Docuverse is restricted to be either a StringDocuverse (i.e., the content is specified by a string) or a URIDocuverse (i.e., the actual content is located at the URI specified).

+

+ + + + + + +

+

Depending on particular scenarios or on the kind of docuverse we are dealing with (plain‐text, XML, LaTeX, a picture, etc.), we need to use different kinds of ranges. Therefore, the class Range has three different subclasses: + + +

PointerRange defines a range by counting characters. In that case, the values of the properties begins and ends must be nonnegative integers that identify unambiguous positions in the character stream, remembering that the value 0 refers to the location immediately before the first character, the value 1 refers to the location after the first character and before the second one, and so on. By using the hasKey OWL property, we also assert that two pointer ranges having equal docuverse and begin and end locations are the same range.

+ + +

XPathRange defines a range considering the whole docuverse or its particular context specifiable through an XPath expression (Berglund et al., + 2007) as value of the property hasXPathContext. Note that by using these ranges, we implicitly admit that the docuverse they refer to must be an XML structure. Moreover, the properties begins and ends have to be applied on the string value obtained by juxtaposing all the text nodes identified by the XPath. By using the hasKey OWL property, we also assert that two xpath ranges having equal docuverse, XPath context, and begin and end locations are the same range.

+
+ +

XPathPointerRange is an XPathRange in which the values of the properties begins and ends must be nonnegative integers that identify unambiguous positions in the character stream as described for the class PointerRange.

+
+ +

+

+ + + + + + +

+

MarkupItem is specialized in three disjoint subclasses—Element, Attribute, and Comment—that allow a more precise characterization of markup items.

+

+ + + + + + +

+
+
+ Range and Markup Item Overlap +

The presence of overlap in EARMARK is worth discussing in more detail. Different types of overlap exist, according to the subset of items involved, and different strategies are needed to detect them. In particular, there is a clear distinction between overlapping ranges and overlapping markup items.

+

By definition, overlapping ranges are two ranges that refer to the same docuverse, so that at least one of the locations of the first range is contained in the interval described by the locations of the second range (excluding its terminal points). Totally overlapping ranges have the locations of the first range completely contained in the interval of the second range, or vice versa, while partially overlapping ranges have either exactly one location inside the interval and the other outside or identical terminal points in reversed roles.

+

Thus, if we consider the following excerpt:

+

+ + + + + + +

+

we can infer, through a reasoner such as Pellet, that these two ranges overlap by using the following rules:

+

+ + + + + + +

+

where P is one of: + + +

lessThan(b1,e1) ∧ greaterThan(b2,b1) ∧ lessThan(b2,e1)

+ + +

lessThan(b1,e1) ∧ greaterThan(e2,b1) ∧ lessThan(e2,e1)

+
+ +

lessThan(e1,b1) ∧ greaterThan(b2,e1) ∧ lessThan(b2,b1)

+
+ +

lessThan(e1,b1) ∧ greaterThan(e2,e1) ∧ lessThan(e2,b1).

+
+ +

+

The case of overlapping markup items is slightly more complicated. We define that two markup items A and B overlap when at least one of the following sentences holds: + + +

[Overlap by range]: A contains a range that overlaps with another range contained by B.

+ + +

[Overlap by content hierarchy]: A and B contain at least a range in common.

+
+ +

[Overlap by markup hierarchy]: A and B contain at least a markup item in common.

+
+ +

+

The three possible scenarios for such item overlap are summarized in Figure + 2. + 14 + +

The EARMARK documents describing these three overlapping scenarios and all the other ones presented in the following sections are available at + http://www.essepuntato.it/2011/jasist/examples +

+ +

+
+ + + + + + + + +

Three EARMARK examples of overlapping between elements p. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The EARMARK ontology, in fact, is completed by another ontology + 15 + +

+ http://www.essepuntato.it/2011/05/overlapping +

+ + that models all overlapping scenarios, either for ranges or markup items, and includes rules for automatically inferring overlaps through a reasoner.

+
+
+ EARMARK as a Standoff Notation +

If we ignore for a moment the semantic implications of using EARMARK and concentrate on its syntactical aspects only, it is easy to observe that EARMARK is nothing but yet another standoff notation, where the markup specifications point to, rather than contain, the relevant substructure and text fragments.

+

Standoff notations, also known in literature as out‐of‐line notations (TEI Consortium, + 2005), are hardly new, but never really caught on for a number of reasons, most having to do with their perceived fragility under the circumstances of desynchronized modification to the text. In Georg, Schonefeld, Trippel, and Witt ( + 2010) and Bański ( + 2010), we can find a pair of recent and substantially complete analyses of their merits and demerits. In particular, according to Georg, Schonefeld, Trippel, and Witt ( + 2010), “standoff annotation has … quite a few disadvantages: + + +

very difficult to read for humans

+ + +

The information, although included, is difficult to access using generic methods.

+
+ +

Limited software support as standard parsing or editing software cannot be employed.

+
+ +

Standard document grammars can be used only for the level which contains both markup and textual data.

+
+ +

New layers require a separate interpretation.

+
+ +

Layers, although separate, often depend on each other.” + 16 + +

To individually address the issues, we edited the original bullets into a numbered list.

+ +

+
+ +

+

And yet, although EARMARK is in practice a standoff notation, it provides a number of workarounds to most of the aforementioned issues.

+

First, since EARMARK is based on OWL and can be linearized in any of the large number of OWL serialization syntaxes, it follows that (a) readability, (b) access, and (c) software support for it are exactly those existing for well‐known, widespread, and important W3C standards such as RDF and OWL. Being able to employ common RDF and OWL tools such as Jena and SPARQL for EARMARK documents was in fact a major motivation for it.

+

Issue 4 should be examined beyond the mere validation against document grammars and toward a general evaluation of the level of compliancy of the markup to some formally specified expectations. EARMARK documents, while being subject to no document grammar in the stricter XML sense, allow the specification of any number of constraints, expressed either directly in OWL or SWRL (Horrocks et al., + 2004), or even in SPARQL, that trigger or generate validity evaluations. In Di Iorio, Peroni, and Vitali (in press), we tried to show that a large number of requirements, from hierarchical well‐formedness in the XML sense, to validation requirements in terms of XML DTDs, to adherence to design patterns, can be expressed satisfactorily using these technologies.

+

Issue 5 regards the difficulty of standoff notations to provide interlayer analysis on XML structures: Separate interpretation of markup layers is easy, but identification and validation of overlapping situations are more complex: Standoff markup is mainly composed of pointers to content and does not have any direct way to determine overlap locations without some kind of pointer arithmetics to compute them. Validation of contexts allowing overlaps as describable using rabbit/duck grammars (Sperberg‐McQueen, + 2006) also is not trivial. In this regard, EARMARK provides yet again a solution that does not require special tools: Although OWL does not allow direct pointer arithmetics, SWRL on the contrary does, as shown earlier where we described a batch of (SWRL‐implementable) rules that do, in fact, determine overlapping locations on EARMARK documents with good efficiency.

+

Finally, Issue 6 refers to the fact that evolution of separate markup annotation layers needs to synchronously take place, lest one of them becomes misaligned with the new state of the document. This is, in summary, the fragility of pointers, which can be considered the fundamental weakness of standoff, as well as of any notation that has markup separate from its content: If a modification occurs to the underlying (probably text‐based) source, all standoff pointers that could not be updated at the same time of the change become outdated and possibly wrong. All standoff notations fall prey to this weakness, and there is no way to completely get rid of it.

+

What is possible is to identify exactly the conditions under which such weakness acts, and see if there is a way to reduce the mere frequency of such events. In fact, for a standoff pointer to become outdated, several conditions must take place at the same time: + + +

The standoff notation must be used as a storage format, rather than just as a processing format;

+ + +

the source document must make sense even without the additional standoff markup (i.e., the standoff notation contains no information that is necessary for at least some types of document modifications);

+
+ +

the source document must be editable (and, in fact, must be edited) on its own;

+
+ +

the standoff pointers must rely on positions that change when the source is edited (e.g., character‐based locations);

+
+ +

editing must be done in contexts and with tools that cannot or do not update the standoff pointers; and

+
+ +

there must be no computable way to determine the modifications of the document (e.g., via a diff between the old and new versions).

+
+ +

+

Of course, no standoff notation can rule out that these conditions occur on their documents, but note that all six of them must occur for standoff pointers to become outdated. EARMARK is not safe from these occurrences either, but at least for the use cases here described, one or more of these conditions simply do not apply: EARMARK is mostly used as a processing format, with no need to save it on disk (Conversion from the source formats such as MS Word is described later and does not require special storage.), the data format described is either in a very specific format (e.g., MS Word or ODT) that in fact already does handle internally its data changes and requires the overlapping data exactly for this purpose, or is in fact the result of a diff action on successive versions of a document (as in the case of the wiki pages). Finally, EARMARK allows references to relatively stable fragment ids of the documents (by using XPath ranges without specifying explicitly begin and end locations) rather than the extremely fragile character locations, further reducing the chances of outdated pointers. For this reason, without being able to completely rule out the possibility of standoff pointers going wrong, we tend to consider it as a significantly little risk, at least for the use case described here.

+
+
+ Using OWL Versus RDF for Standoff Notations +

EARMARK is strongly based on OWL 2 DL (W3C OWL Working Group, + 2009) to express multiple markup layers with possible overlapping ranges over the same content. OWL 2 DL is not the only possible choice for expressing standoff notations via Semantic Web technologies. In fact, RDF is another valid and effective model for dealing with the same issue, as shown in Tummarello et al. ( + 2005), by means of the open‐source application programming interface (API) RDF Textual Encoding Framework (RDFTef). This API was created to demonstrate a plausible way for handling overlapping markup within documents and identifying textual content of a document as a set of independent RDF resources that can be linked mutually and with other parent resources.

+

Besides giving the possibility to define multiple structural markup hierarchies over the same text content, the use of RDF as the language for encoding markup allows one to specify semantic data on textual content as well. But the real main advantage in using RDF is the possibility of using particular built‐in resources specifically defined in the RDF syntax specification (Beckett, + 2004) for describing and dealing with different kinds of containers, either ordered (rdf:Seq) or unordered (rdf:Bag). Thus, RDF resources can be used to represent every printable element in the text—words, punctuation, characters, typographical symbols, and so on—while RDF containers also can be used to combine such fragments and containers.

+

RDF alone, however, is not sufficient to define a formal vocabulary for structural markup: Does a given resource represent an element, an attribute, a comment, or a text node? In which way is a resource of a certain type related to others? The specification of an RDFS (Brickley & Guha, + 2004) or of an OWL layer can successfully address these issues. Hybrid solutions obtained by mixing different models, even when they are built one upon another, may seem elegant, but they are not necessarily the best choice. In fact, there exist well‐known interoperability limits between OWL 2 DL and RDF that prevent the correct use of Semantic Web tools and technologies. In particular: + + +

Any markup document made using RDF containers (e.g., to describe what markup items contain and in which order) and OWL ontologies (e.g., to define classes of markup entities and their semantics) results in a set of axioms that end up outside of OWL DL and well within OWL Full, which limits the applicability of the most frequently used Semantic Web tools that are usually built upon the (computationally tractable) description logic underlying OWL 2 DL.

+ + +

The individual analysis of each language may not be applicable when we have to check particular properties that lie between RDF and OWL layers. For example, verifying the validity of a markup document against a particular schema, which is one of the most common activities with markup, needs to be made to work with both markup item structures (that would be defined in RDF) and logical constraints about classes of markup items (e.g., elements only, attributes only, the element “p,” all the elements of a particular namespace, etc., all of them definable in OWL).

+
+ +

+

Being able to express everything we need directly in OWL quite straightforwardly addresses both issues. The well‐known absence of containers and sequences in OWL can be overcome by modeling classes in specific ways using specific design patterns such as those in Ciccarese et al. ( + 2008) and in Drummond et al. ( + 2006).

+
+
+
+ Using EARMARK +

There are multiple applications for the EARMARK approach. The most interesting for this article is its capability of dealing with overlapping structures in an elegant and straightforward manner. Under EARMARK, such structures do not need to be specified through complex workarounds as with XML, but they are explicit and can be easily described and accessed. Sophisticated searches and content manipulations become very simple when using this ontological model.

+

The goal of this section is to demonstrate the soundness and applicability of EARMARK by discussing how the use cases presented earlier are addressed. Note that throughout the section we investigate multiple EARMARK data structures and documents, focusing on the feasibility and potentiality of such an ontological representation.

+
+ Looking for Authorial Changes in Office Documents +

The discussion about change tracking in office document formats showed that both ODT (OpenOffice format) and OOXML (Microsoft Word format) use complex data structures to store overlaps generated by change‐tracking functionalities. These structures make it very difficult to search and manipulate the content when using XML languages and tools. Even very simple edits generate a rather tangled set of overlapping elements.

+

Let us recall the example mentioned earlier where the user “John Smith” splits a single paragraph into two. The ODT representation is:

+

+ + + + + + +

+

The OOXML representation (shown earlier) is even more complex. In fact, these formats exploit in large scale (tangled) fragmentation (OOXML) or milestones and stand‐off markup (ODT) to deal with overlaps.

+

EARMARK, on the other hand, stores overlapping data in a direct and streamlined manner that does not require tools to rebuild information from the twists of a tree‐based XML structure. The information already is available and expressed through consistent RDF and OWL statements. Figure + 3 graphically shows the corresponding EARMARK document.

+
+ + + + + + + + +

Encoding in EARMARK the ODT change‐tracking example. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The original paragraph content and the new string “also” are now encoded as two docuverses over which the ranges r1, r2, and r3 are defined. The original paragraph is then composed of the (content of) ranges r1 and r2 while the paragraphs resulting after the (text and carriage return) insertion now comprise range r1 and ranges r2 and r3, respectively. Metadata about the author and the modification date are encoded as further RDF statements.

+

+ + + + + + +

+

The advantages of streamlining overlaps become apparent if we consider tasks a little beyond the mere display. For instance, the query for “the textual content of all paragraphs inserted by John Smith” ends up rather entangled if we used XPath on the ODT structure. The process for finding that textual content needs to browse the current version of the document and look for all the text:change‐start/text:change‐end pairs that refer to an insertion made by John Smith involving the creation of a new paragraph (i.e., text:change‐start is in a first paragraph while its pair, text:change‐end, is in the following one) that are either currently present in the document body or hidden behind a subsequent deletion made by someone else. Once the paragraphs are identified, we need to retrieve the content that originally was contained there (i.e., the text fragments that still are within those boundaries or that may have been deleted in subsequent versions). The following XPath represents an implementation of this process:

+

+ + + + + + +

+

The XML structure of an MS Word file, using segmentation rather than milestones, does simplify the query a bit, but still presents some radical complexities. The process starts by choosing all those w:p elements that were inserted by John Smith as well as all their previous and contiguous w:p elements that were deleted before or inserted after the first ones. In OOXML, each sequence of contiguous w:p elements implicitly represents one paragraph. Therefore, we can now take all the text fragments contained in each w:p sequence that were inserted before or deleted after the paragraph defined by the sequence itself. The following is the resulting XPath for an OOXML document.

+

+ + + + + + +

+

The complexity of both XPath queries is due to the intrinsic complexity of the data structure on which the query has to work. Although the interface of OpenOffice or MS Word may provide tools to directly deal with these queries using specific strategies on the internal data structures, applications working directly on the XML structure have very little help in disentangling the mess of the data formats.

+

On the other hand, since EARMARK documents are actually OWL files, it is possible to access and query them with plain Semantic Web tools. Powerful searches then can be performed without using niche‐specific tools or complex and long XPath expressions simply with mainstream technologies such as SPARQL (Garlik & Seaborne, + 2010).

+

The corresponding SPARQL query for (“the textual content of all paragraphs inserted by John Smith”) therefore can be written as follows:

+

+ + + + + + +

+

But EARMARK is useful for even more than querying: EARMARK also decreases the costs, in terms of efforts and lines of code, for manipulating documents.

+

Let us consider the task of generating an intermediate version (i.e., neither the first nor the last one of a version chain) from a document that includes change‐tracking information about the whole document history.

+

The process of rebuilding these versions by working on the XML structure without specific APIs is both complex and inefficient. For example, a basic XSLT that returns an XML document defining the desired version requires to at least: + + +

define templates for all the elements actively involved in the change tracking (e.g., for ODT, text:changed‐region, text:change‐start, text:change‐end, and text:change and similarly for OOXML) to understand, by looking at their creation date, whether they must be considered or ignored when building the requested version. In particular, we must exclude insertions following and deletions preceding the version we are building;

+ + +

define templates for paragraphs to handle cases where the paragraph is the result of an insertion or a deletion of other paragraphs to identify whether it should be considered for the result and, in such case, finding out its real text content and remembering that in the following versions, such content may have spread out among other paragraphs;

+
+ +

define templates for handling insertions/deletions for structures such as images, sections, lists, and tables; and

+
+ +

define an identity template for the other elements to visit the entire document.

+
+ +

+

Even the most basic and incomplete implementation of such XSLT requires hundreds of lines of complex and convoluted code and a large number of ad hoc decisions based on the specificities of whether we start from ODT or OOXML. Note also that a Java‐based implementation (or in any other procedural language) of the same process would be equally or even more complex.

+

The same result can be achieved on EARMARK documents with a few lines of Java code:

+

+ + + + + + +

+

This approach uses the EARMARK Java API + 17 + +

+ http://earmark.sourceforge.net +

+ + and a single SPARQL query, runnable on any SPARQL 1.1 processor such as Jena, to identify the root node of the subtree of the version that is associated with the specified date and creator. Then, it performs a simple, recursive, depth‐first visit to clone all the nodes in the tree and to combine them in the output EARMARK document.

+

This method heavily uses Semantic Web technologies on the structures provided by EARMARK whose characteristics are always explicit and clear. In fact, since all versions coexist within the EARMARK document and each version can be encoded explicitly as a tree within the overall graph, this operation is straightforward and fast.

+
+
+ Improving Semantic Annotations +

EARMARK also can be exploited to improve semantic annotations. As noted earlier, there are in fact strong limitations in the same process of annotating web documents with semantic structures that overlap the structural ones. The same example—of vcards that cannot be created on the top of tables organized per rows—will be used in this section.

+

We solve this by converting the web document with annotations into an EARMARK document, allowing both semantic and structural annotations to coexist. Through EARMARK, we can explicitly express both markup structures and vcard assertions. Figure + 4 shows how the vcard example can be modeled (Once again, we show a graphical representation for the sake of clarity.)

+
+ + + + + + + + +

The abstract model of the EARMARK document solving the microformats issue. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The textual content of the original table cells is now encoded in two different docuverses: one for the header (with roles) and one for the body (with names of committee members). Ranges r1, r2, …, r8 are then created to distinguish each role and name. Two independent and coexisting hierarchies are then built on top of the same set of ranges: the HTML table that includes one cell for each range (in blue) and the vcards about each person (in green) that include only the relevant ranges and overlap the previous one. Note also that the vcards are defined in such a way that does not interfere with the structural features of the table. The full linearization in OWL of this example can be found at + http://www.essepuntato.it/2011/jasist/examples +

+
+
+ Improving Wiki Content Reversions +

EARMARK can be used to improve wiki reversion mechanisms and overcome the limitations discussed earlier: The automatic filtering and merging of contributions from multiple versions of the same page are still a manual process, but it can be fully automatized if the overlapping structures buried in the whole history of the page become explicit.

+

The role of EARMARK is to make those structures explicit and available for more sophisticated content manipulation. To understand the extent EARMARK structures can be derived from wikis and how they can be exploited by the final users, we use as our example the wiki platform MediaWiki + 18 + +

+ http://www.mediawiki.org +

+ + (i.e., the wiki engine of Wikipedia).

+

MediaWiki offers sophisticated functionalities for creating diffs of wiki content. Users can compare any two revisions in the page history and highlight changes in a friendly interface that shows modifications with a word‐level granularity. Diff pages contain metadata about each compared version (when the version was created, who the author was, or which IP address an anonymous author was connected from, etc.) and a two‐column table showing the changes side by side. Changes are detected a posteriori by comparing two arbitrary versions, which are not even required to be temporally contiguous.

+

The output of the MediaWiki diff engine has regularities that can be exploited to automatically build the overlapping structures of the diff and to express them in EARMARK. Let us consider a fictitious example summarized in Table + 1, where an initial text is revised three times by different authors.

+ + + All the versions of a wiki page modified by different authors. + + + + + + + + + + Version Author + V1 151.61.3.122 + V2 Angelo Di Iorio + V3 Silvio Peroni + V4 Fabio Vitali + + + + + Content + Bob was farming carrots and tomatoes. + Bob was farming carrots, tomatoes and beans. + Bob was farming carrots, tomatoes and green beans. They were all tasteful. + Bob was farming carrots, tomatoes and green beans. [new paragraph] They were all tasteful. + + + +
+
+

To display the differences between V1 and V2, MediaWiki creates a page whose HTML code is as follows: + 19 + +

For the sake of clarity, we removed all markup irrelevant to our discussion.

+ +

+

+ + + + + + +

+

This is an HTML table of two rows, the first showing metadata (date and author of the modification), and the second showing the actual modifications. The first cell of the second row contains all the unmodified text and a del element for each inline fragment that was deleted. The second cell contains all the unmodified text and an ins element for each inline fragment that was inserted. Thus, these cells share exactly the same unmodified part(s) of the two compared versions.

+

When the structure itself is modified rather than merely the text, the source code of the MediaWiki diff is slightly different. Thus, the diff between V3 and V4 (which splits a paragraph in two) is as follows:

+

+ + + + + + +

+

The diff output is not complete or sophisticated, and of course, it is a completely different task to replan such an algorithm (but for a first idea of natural changes in diffing XML documents, see Di Iorio, Marchetti, Schirinzi, & Vitali, + 2009). Thus, limitations of that algorithm are inevitably shared by any EARMARK representation. Yet, this output is sufficiently rich to allow us to extract the overlapping information we need. For instance, the insertion of a nonbreaking space or a carriage return generates rows according to specific rules that can be easily detected to capture the actual change by the author.

+

Figure + 5 shows the aforementioned example rebuilt in EARMARK. All versions are encoded in the same document by creating overlapping assertions over the docuverses. Metadata and RDF statements are layered on top of those assertions and create a rich knowledge‐base about the history of the documents and, in particular, about the history of each fragment.

+
+ + + + + + + + +

The wiki sample versions encoded in a single EARMARK document. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

Due to the complexity of the example, we labeled arrows with numbers that indicate the position of each range within each markup item. Consider, for instance, Version V4: It is composed of two DIV elements, the first one containing the concatenation of “Bob was farming carrots” + “,” + “tomatoes” + “and” + “green” + “beans” + “.” and the second one containing the string “They were all tasteful.”

+

Implementing a wiki content‐filtering mechanism on top of such a structure is rather simple. For instance, the removal of all the contributions of “Angelo Di Iorio” that leaves untouched all the content written (previously and subsequently) by “Silvio Peroni” and “Fabio Vitali” can be performed straightforwardly. Three steps are enough to apply such an intermediate content reversion: + + +

the identification of the fragments written by “Angelo Di Iorio,” which is a straightforward SPARQL query on the embedded statements;

+ + +

the creation of a new version where references to those fragments are removed and references to fragments no longer in the document are correctly fixed;

+
+ +

the translation of that document into an actual MediaWiki page through the serialization process described in Peroni and Vitali ( + 2009).

+
+ +

+

Of course, an automatic process may generate ambiguities or even errors in the resulting content (i.e., some parts may become dangling, wrong, or unclear after removing text fragments elsewhere); grammar discrepancies also might be generated by the same approach. Linguistic and semantic problems, however, become a problem once the technical issues of managing independent, yet successive, edits are solved. What is important is that all the information about overlaps and dependencies among fragments is available in EARMARK and can easily be searched, filtered, and manipulated. Besides, foreseeing a manual intervention for checking and polishing automatically filtered content is perfectly in line with the wiki philosophy, so that the wiki community itself can wisely use the reversion tools to revise the content and adjust any intervening minor nuisances or imperfections. Such checks would still be far simpler and faster than would the manual process of partially reverting versions as we have today.

+
+
+
+ Generating EARMARK From Existing Documents: The ROCCO Approach +

Since we do not expect documents to be natively written in EARMARK or manually created by users, we need a way to extract EARMARK data structures from existing XML‐based resources, which is trivial when the XML is simple and clearly hierarchical and slightly more complex when the XML contains workarounds to force an intrinsically overlapping situation into a single hierarchy.

+

We designed a reliable process to transform XML files into EARMARK documents that fully captures overlapping structures even when the overlaps are hidden in one of the many well‐known workarounds. This approach takes as input an XML file and produces the corresponding EARMARK document in five steps: Read, Overhaul, Convert, Classify, and Organize (hence, the name ROCCO).

+

Since ROCCO is not the main topic of this article, we very briefly discuss the issue of converting XML into EARMARK, explaining how each step works. The ROCCO algorithm performs five steps, described next.

+
+ Read and Overhaul +

The first two steps consist of loading the XML source file and, if needed, adding information useful for further processing. In EARMARK, there is a clear distinction between the textual content of a document and the structures built on top of it: The content is stored as plain text—within docuverses—and all structures are externalized and expressed through OWL and RDF assertions.

+

While OpenOffice stores all overlapping structures in the main document file, some other editors (e.g., MS Word) store overlaps in many different ways, even in a separate file. The overhaul step extracts such data and adds them to the main content document by exploiting format‐specific procedures, implemented via XSLT in most cases.

+
+
+ Convert +

The subsequent step consists of converting the XML source file into an early EARMARK document that expresses exactly the same information and hierarchies. No interpretation or disentanglement of workarounds is performed at this step.

+

Since the input is XML, this translation can be performed directly via a generic XSLT stylesheet. It basically consists of a recursive algorithm that parses the source file and generates the corresponding instances in the EARMARK ontology. Such a translation is straightforward and not difficult.

+
+
+ Classify +

The “Classify” step extends the EARMARK document built so far with information about the workarounds used to encode overlaps. That information will be exploited in the subsequent steps to make those overlaps explicit.

+

The basic idea is to exploit OWL reasoners to detect workarounds in an early EARMARK document D by: + + +

defining an ontology O that models all the workarounds used by applications, such as milestones, stand‐off markup, etc.; these workarounds are specific to the data format used in the source document;

+ + +

specifying the EARMARK document D as an ABox for the ontology O;

+
+ +

defining SWRL rules that capture the role of each element in D and check relationships between elements;

+
+ +

running an OWL reasoner, such as Pellet, on D+O to create new OWL instances and properties that identify which workarounds are present.

+
+ +

+

The actual detection of workarounds is delegated to an external reasoner. Refining detection strategies and even adding new strategies for new formats all can be done via OWL and SPARQL. Indeed, tricky issues need to be addressed—mostly depending on the idiosyncrasies of the original formats—but no procedural code is required.

+
+
+ Organize +

The final step consists of building yet another EARMARK document that expresses the overlaps and metadata in an explicit way, based on the information collected by the previous steps. This phase consists of mapping operations from the native format into the EARMARK structure. Such conversion relies on the identification of metadata to classify the operations and to externalize relevant metadata in separate RDF statements.

+
+
+
+ Evaluating EARMARK +

One of the most frequent criticisms when proposing a different approach to solving a well‐known problem in information and communication technology is that the new solution may simplify the difficulties of the specific problem, but brings with it hidden costs in terms of size of the data structure, computation efforts, or conversion restrictions that compensate the advantages. In our case, one of the anonymous reviewers of our article (Di Iorio, Peroni, & Vitali, + 2009) wondered whether a difference in file size could weigh in on the convenience of adopting EARMARK as opposed to working with the original files.

+

As such, a discussion of cost functions of EARMARK versus other formats is in order. Yet, a systematic discussion of the relative costs (e.g., in byte size) of some original XML‐based data structures versus their EARMARK equivalent is an open‐ended undertaking that heavily depends on the original XML data structure and the specific features present in the document, and is badly defined anyway: While XML is a linearization format immediately expressible in actual bytes, OWL (or more precisely, RDF, the language in which OWL ontologies are expressed) is an abstract structure that allows a large number of linearization formats (including XML itself) with corresponding huge differences in the final byte counts.

+

For these reasons, to provide at least an initial test of meaningful concepts, we selected two XML‐based data formats (OOXML and ODT) and, specifically, a set of documents where overlapping tricks were present (i.e., where change‐tracking was active). To bypass the size discussion, we decided not to test byte lengths (which are not meaningful and easily skewed, e.g., by reducing the string length of the element names or of the class names) but the number of nodes for XML documents and of triples for OWL documents. This comparison again is not particularly appropriate (Triples are naturally numerous in OWL ontologies, and it is customary to deal with hundreds of thousands and even millions of assertions in Semantic Web applications.) but closer to meaningfulness than is the mere byte count.

+

Our comparison was carried out on a small set of documents in ODT and OOXML that included change‐tracking information. As discussed in the previous sections, change‐tracking facilities generate rather complex overlaps even for basic operations on small text fragments, which in turn are expressed as a potentially huge number of standoffs and milestone markup within the XML hierarchy. The same documents were individually converted into EARMARK. We then charted how simple edits under change‐tracking affect the number of nodes in XML formats and of statements in OWL files. + 20 + +

The full details about each version and each format also are available at + http://www.essepuntato.it/2011/jasist/discussion +

+ +

+

We created seven different versions, named after the “Seven Dwarfs” for recognizability, by applying very common edits (e.g., the insertion of a few words, the deletion of some sentences, the split of a paragraph, etc.) on a small document, creating multiple overlaps. Figure + 6 shows the results of our comparison.

+
+ + + + + + + + +

A graph summarizing the results of the first experiment. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The overall trend is interesting and comforting: While in simple documents with no overlap the node count of XML is lower than is the assertion count of EARMARK triples, the presence of overlaps makes EARMARK and XML formats comparable. The growth of EARMARK statements is in fact very close to the growth of XML nodes when the number of overlaps increases. EARMARK is even more efficient than is XML for more complex documents.

+

The measure for each format was done by counting only those nodes and statements instrumental to encode content and (overlapping) structures: We did not take into account either the presentational information for ODT and OOXML (Each file, for instance, includes a very long list of style definitions that are not relevant for the purposes of our analysis.) or namespace declarations (OOXML files, for instance, list all relevant namespaces for the Office toolkit.) or ignorable white spaces (that are only added to indent content and improve readability).

+

Interestingly, EARMARK and ODT show a very similar increase in size while OOXML is much more verbose and grows faster. The content of the first version, for instance, is encoded using four nodes in ODT, 13 statements in EARMARK, and 54 nodes in OOXML; the last one contains 241 ODT nodes, 233 EARMARK statements, and 452 OOXML nodes. To return to our original inquiry, it is clear that the weight of EARMARK documents is very good compared to the other ones.

+

Also note the regularity in the growth of EARMARK statements. Regardless of the actual modifications applied to the document, in fact, EARMARK adds about 40 statements for each edit. Both OOXML and ODT, on the contrary, show a more irregular “pace.” The reason for this is that EARMARK externalizes all assertions, so that all modifications (either to leaf nodes or to intermediate nodes in the original XML) are “flattened” onto the docuverses and do not depend on the complexity of the structure within which the edit took place.

+

Figure + 7 shows the results of a similar comparison on a different set of documents and edits. We collected seven versions named after the days of the week and created by seven different authors when editing a very simple document. The overall trend does not change, and shows that EARMARK and ODT again have a comparable behavior, far better than that of OOXML.

+
+ + + + + + + + +

A graph summarizing the results of the second experiment. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

In conclusion, although preliminary, this study shows clear trends of a very conservative behavior of EARMARK with respect to document size.

+
+
+ Conclusions +

Overlaps, far from being an obscure requirement for sophisticated functionalities of arcane markup languages, are a very frequent undertaking even in major data formats and in rather frequent situations. Yet, since the XML language does not allow them, consciously or not, designers of data formats have adopted a huge and entangled array of tricks, special cases, and workarounds that, although solving the actual problem of storing overlapping structures, open new and complicated ones when approaching even basic chores on documents containing them, such as queries.

+

The EARMARK approach drastically reduces the effort needed to perform such chores on overlapping structures since it does not allow the corresponding multiple trees to actually entangle and complicate the job. EARMARK is radically different from both special markup metalanguages that allow overlaps and the introduction of workarounds within the traditional tree‐oriented XML language because it treats multiple trees over the same content as first‐class citizens of the language, yet uses well‐known and standard W3C technologies and languages to perform all tasks. EARMARK documents, in the end, are OWL ontologies. Thus, any Semantic Web technology (e.g., SPARQL) can be used straightforwardly to perform operations on their content.

+

Improving queries is not the only application of EARMARK. Validation is another interesting field that we are investigating. In fact, the same ontological framework can be used to prove properties concerning a document, such as validity against a schema, compliance to co‐constraint specifications, or adherence to structural patterns. Moreover, inspired by Marcoux and Rizkallah ( + 2009), in which they described an approach for defining natural‐language semantics for XML‐based languages, we also are developing an ontology‐based approach for encoding markup semantics—that is, the formal definition of meanings of markup elements, besides the syntactical structure of a markup document—within EARMARK documents.

+
+ + References + + + + Adida, + B. + , + + Birbeck, + M. + , + + McCarron, + S. + , & + + Pemberton, + S. + + ( + 2008). RDFa in XHTML: Syntax and processing. W3C Recommendation, October 14, 2008, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/rdfa‐syntax/ + + + + + + Allsopp, + J. + + ( + 2007). + Microformats: Empowering your markup for Web 2.0. + New York, NY: + Friends of ED Press. + + + + + + Bański, + P. + + ( + 2010). + Why TEI stand‐off annotation doesn't quite work: And why you might want to use it nevertheless. In + Proceedings of Balisage: The Markup Conference 2010. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://www.balisage.net/Proceedings/vol5/html/Banski01/BalisageVol5‐Banski01.html + + + + + + Beckett, + D. + + ( + 2004). RDF/XML syntax specification (Rev.). W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/2004/REC‐rdf‐syntax‐grammar‐20040210/ + + + + + + Berglund, + A. + , + + Boag, + S. + , + + Chamberlin, + D. + , + + Fernández, + M.F. + , + + Kay, + M. + , + + Robie, + J. + , & + + Siméon, + J. + + ( + 2007). XML Path Language (XPath) 2.0. W3C Recommendation, January 23, 2007, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/xpath20/ + + + + + + Brickley, + D. + , & + + Guha, + R.V. + + ( + 2004). RDF Vocabulary Description Language 1.0: RDF Schema. W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/rdf‐schema/ + + + + + + Ciccarese, + P. + , + + Wu, + E. + , + + Kinoshita, + J. + , + + Wong, + G. + , + + Ocana, + M. + , + + Ruttenberg, + A. + , & + + Clark, + T. + + ( + 2008). + The SWAN biomedical discourse ontology. + Journal of Biomedical Informatics, + 41( + 5), + 739– + 751. + + + + + + DeRose, + S. + ( + 2004). + Markup overlap: A review and a horse. In + Proceedings of the Extreme Markup Languages 2004. + Rockville, MD: + Mulberry Technologies. 
Retrieved from + http://conferences.idealliance.org/extreme/html/2004/DeRose01/EML2004DeRose01.html + + + + + + Di Iorio, + A. + , + + Marchetti, + C. + , + + Schirinzi, + M. + , & + + Vitali, + F. + + ( + 2009). + Natural and multi‐layered approach to detect changes in tree‐based textual documents. In + + J. + Cordeiro + & + + J. + Filipe + (Eds.), + Proceedings of the 11th International Conference on Enterprise Information Systems (ICEIS 2009) (pp. + 90– + 101). + Heidelberg, Germany: + Springer. + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + ( + 2009). + Towards markup support for full GODDAGs and beyond: The EARMARK approach. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Peroni01/BalisageVol3‐Peroni01.html + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + + ( + 2010). + Handling markup overlaps using OWL. In + + P. + Cimiano + & + + H. S. + Pinto + (Eds.), + Proceedings of the 17th International Conference on Knowledge Engineering and Knowledge Management (EKAW 2010) (pp. + 391– + 400). + Heidelberg, Germany: + Springer. + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + (in press). + Using Semantic Web technologies for analysis and validation of structural markup. + International Journal of Web Engineering and Technology. + + + + + + Drummond, + N. + , + + Rector, + A. + , + + Stevens, + R. + , + + Moulton, + G. + , + + Horridge, + M. + , + + Wang, + H.H. + , & + + Seidenberg, + J. + + ( + 2006). + Putting OWL in order: Patterns for sequences in OWL. In + + B. C. + Grau + , + + P. + Hitzler + , + + C. + Shankey + , & + + E. + Wallace + (Eds.), + Proceedings of the Workshop on OWL: Experiences and Directions (OWLED 2006), + Athens, GA. Retrieved from + http://sunsite.informatik.rwth‐aachen.de/Publications/CEUR‐WS/Vol‐216/submission_12.pdf + + + + + + Durand, + D.G. 
+ + ( + 1994, October). + Palimpsest, a data model for revision control. Paper presented at the Workshop on Collaborative Editing Systems at the Computer Supported Cooperative Work Conference (CSCW94), Chapel Hill, NC. + + + + + + Durand, + D.G. + ( + 2008). + Palimpsest: Change‐oriented concurrency control for the support of collaborative applications. + Charleston, SC: + CreateSpace. + + + + + + Garlik, + S.H. + , & + + Seaborne, + A. + + ( + 2010). SPARQL 1.1 Query Language. W3C Working Draft, October 14, 2010, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/sparql11‐query/ + + + + + + Georg, + R. + , + + Schonefeld, + O. + , + + Trippel, + T. + , & + + Witt, + A. + + ( + 2010). + Sustainability of linguistic resources revisited. In + Proceedings of the International Symposium on XML for the Long Haul: Issues in the Long‐Term Preservation of XML. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://www.balisage.net/Proceedings/vol6/html/Witt01/BalisageVol6‐Witt01.html + + + + + + Goldfarb, + C.F. + ( + 1990). + The SGML Handbook. + New York, NY: + Oxford University Press. + + + + + + Horridge, + M. + , & + + Patel‐Schneider, + P. + + ( + 2009). OWL 2 Web Ontology Language: Manchester Syntax. W3C Working Group Note, October 27, 2009, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/owl2‐manchester‐syntax/ + + + + + + Horrocks, + I. + , + + Patel‐Schneider, + P.F. + , + + Boley, + H. + , + + Tabet, + S. + , + + Grosof, + B. + , & + + Dean, + M. + + ( + 2004). SWRL: A Semantic Web rule language combining OWL and RuleML. W3C Member Submission, May 21, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/Submission/SWRL/ + + + + + + Huitfeldt, + C. + , & + + Sperberg‐McQueen, + C.M. + + ( + 2003). TexMECS: An experimental markup meta‐language for complex documents. Retrieved from + http://decentius.aksis.uib.no/mlcd/2003/Papers/texmecs.html + + + + + JTC1/SC34 WG 4. ( + 2008). 
ISO/IEC 29500‐1:2008—Information technology—Document description and processing languages—Office Open XML File Formats: Part 1. + Fundamentals and markup language reference. + Geneva, Switzerland: + International Organization for Standardization. + + + + + JTC1/SC34 WG 6. ( + 2006). + ISO/IEC 26300:2006—Information technology—Open document format for office applications (OpenDocument), Version 1.0. + Geneva, Switzerland: + International Organization for Standardization. + + + + + + Marcoux, + Y. + , & + + Rizkallah, + E. + + ( + 2009). + Intertextual semantics: A semantics for information design. + Journal of the American Society for Information Science and Technology, + 60( + 9), + 1895– + 1906. + + + + + + Marinelli, + P. + , + + Vitali, + F. + , & + + Zacchiroli, + S. + + ( + 2008). + Towards the unification of formats for overlapping markup. + New Review of Hypermedia and Multimedia, + 14( + 1), + 57– + 94. + + + + + + Nelson, + T. + ( + 1980). + Literary machines: The report on, and of, Project Xanadu concerning word processing, electronic publishing, hypertext, thinkertoys, tomorrow's intellectual … including knowledge, education and freedom. + Sausalito, CA: + Mindful Press. + + + + + + Peroni, + S. + , & + + Vitali, + F. + + ( + 2009). + Annotations with EARMARK for arbitrary, overlapping and out‐of order markup. In + + U.M. + Borghoff + & + + B. + Chidlovskii + (Eds.), + Proceedings of the 2009 ACM Symposium on Document Engineering (DocEng 2009) (pp. + 171– + 180). + New York, NY: + ACM. + + + + + + Portier, + P. + , & + + Calabretto, + S. + + ( + 2009). + Methodology for the construction of multi‐structured documents. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Portier01/BalisageVol3‐Portier01.html + + + + + + Riggs, + K.R. + + ( + 2002). + XML and free text. 
+ Journal of the American Society for Information Science and Technology, + 53( + 6), + 526– + 528. + + + + + + Salembier, + P. + , & + + Benitez, + A.B. + + ( + 2007). + Structure description tools. + Journal of the American Society for Information Science and Technology, + 58( + 9), + 1329– + 1337. + + + + + + Schmidt, + D. + + ( + 2009). + Merging multi‐version texts: A generic solution to the overlap problem. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Schmidt01/BalisageVol3‐Schmidt01.html + + + + + + Schmidt, + D. + , & + + Colomb, + R. + + ( + 2009). + A data structure for representing multi‐version texts online. + Journal of Human–Computer Studies, + 67( + 6), + 497– + 514. + + + + + + Schonefeld, + O. + , & + + Witt, + A. + ( + 2006). + Towards validation of concurrent markup. In + Proceedings of the Extreme Markup Languages 2006. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://conferences.idealliance.org/extreme/html/2006/Schonefeld01/EML2006Schonefeld01.html + + + + + + Sperberg‐McQueen, + C.M. + + ( + 2006). + Rabbit/duck grammars: A validation method for overlapping structures. In + Proceedings of Extreme Markup Languages Conference 2006. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://conferences.idealliance.org/extreme/html/2006/SperbergMcQueen01/EML2006SperbergMcQueen01.html + + + + + + Sperberg‐McQueen, + C.M. + , & + + Huitfeldt, + C. + + ( + 2004). + GODDAG: A data structure for overlapping hierarchies. In + + P.R. + King + & + + E.V. + Munson + (Eds.), + Proceedings of the 5th International Workshop on the Principles of Digital Document Processing (PODDP 2000) (pp. + 139– + 160). + Heidelberg, Germany: + Springer. + + + + + TEI Consortium. ( + 2005). TEI P5: Guidelines for electronic text encoding and interchange. Retrieved from + http://www.tei‐c.org/Guidelines/P5 + + + + + + Tennison, + J. 
+ , & + + Piez, + W. + + ( + 2002, August). The Layered Markup and Annotation Language (LMNL). Paper presented at the Extreme Markup Languages Conference 2002, Montreal, Canada. + + + + + + Tummarello, + G. + , + + Morbidoni, + C. + , & + + Pierazzo, + E. + + ( + 2005). + Toward textual encoding based on RDF. In + + M. + Dobreva + & + + J. + Engelen + (Eds.), + Proceedings of the Ninth ICCC International Conference on Electronic Publishing (ELPUB2005). + Leuven, Belgium: + Peeters. + + + + + W3C OWL Working Group. ( + 2009). OWL 2 web ontology language document overview. W3C Recommendation, October 27, 2009, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/owl2‐overview/ + + + + +
\ No newline at end of file diff --git a/tools/.gitignore b/tools/.gitignore new file mode 100644 index 0000000..c2658d7 --- /dev/null +++ b/tools/.gitignore @@ -0,0 +1 @@ +node_modules/ diff --git a/tools/parse_doctype b/tools/parse_doctype new file mode 120000 index 0000000..f000651 --- /dev/null +++ b/tools/parse_doctype @@ -0,0 +1 @@ +./node_modules/get-doctype/parse-doctype \ No newline at end of file diff --git a/xml/bad-formed.xml b/xml/bad-formed.xml new file mode 100644 index 0000000..683d67a --- /dev/null +++ b/xml/bad-formed.xml @@ -0,0 +1,199 @@ + + + + +Springer Berlin Heidelberg +Berlin, Heidelberg + + + +10 +0724-6145 +1616-8542 +Advances in Biochemical Engineering +Adv Biochem Eng Biotechnol + + + + + +Prof. Dr. +T. +K. +Ghose + + + +Prof. Dr. +A. +Fiechter + + + + +Prof. Dr. +N. +Blakebrough + + + + +Prof. Dr. +S. +Aiba + + + + +Prof. Dr. +B. +Atkinson + + + + +Dr. +J. +Böing + + + + +Prof. Dr. +J. +R. +Bourne + + + + +Dr. +E. +Bylinkina + + + + +Prof. Dr. +H. +Dellweg + + + + +Dr. +A. +L. +Demain + + + + +Prof. Dr. +R. +Finn + + + + +Dr. +K. +Kieslich + + + + +Prof. Dr. +R. +M. +Lafferty + + + + +Prof. Dr. +M. +Moo-Young + + + + +Dr. +I. +Nüesch + + + + +Prof. Dr. +L. +K. +Nyiri + + + + +Prof. Dr. +H. +J. +Rehm + + + + +Prof. Dr. +P. +L. +Rogers + + + + +Prof. Dr. +W. +Schmidt-Lorenz + + + + +Prof. Dr. +H. +Suomalainen + + + + +Prof. Dr. +F. +Wagner + + + + + + +3-540-08990-X +Advances in Biochemical Engineering, Volume 11 +11 +10.1007/3-540-08990-X +6949 +978-3-540-08990-2 +978-3-540-35678-3 +5 + +Springer-Verlag +1979 + + +Chemistry +Chemistry/Food Science, general +Chemistry and Materials Science + + + + + + + + diff --git a/xml/oup-sample.xml b/xml/oup-sample.xml new file mode 100644 index 0000000..666b5ac --- /dev/null +++ b/xml/oup-sample.xml @@ -0,0 +1,103 @@ + + +
+ + + litlin + litlin + Literary and Linguistic Computing + Lit Linguist Computing + 0268-1145 + 1477-4615 + + Oxford University Press + + + + fqh047 + 10.1093/llc/fqh047 + + + Articles + + + + < + teiPublisher>: A Repository Management System for TEI Documents + + + + + Kumar + Amit + + + + + Schreibman + Susan + + + + + Arneil + Stewart + + + + + Holmes + Martin + + + + + Bia + Alejandro + + + + + Walsh + John + + + Graduate School of Library Information Sciences, University of Illinois at Urbana-Champaign, USA University of Maryland Libraries, USA University of Victoria Humanities Computing and Media Centre, Canada Operating Research Center, Miguel Hernández University, Spain Digital Library Program/University Information Technology Services, Indiana University + + + The Graduate School of Library and Information Science, University of Illinois at Urbana-Champaign, 501E. Daniel Street, Champaign, IL 61820, USA + E-mail: + amitku@uiuc.edu + + + + 03 + 2005 + + 20 + 1 + 117 + 132 + + © The Author. Published by Oxford University Press on behalf of ALLC and ACH. All rights reserved. For Permissions, please email: journals.permissions@oupjournals.org + 2005 + + +

Digital Humanities (DH) and Digital Library (DL) projects are complex systems that require specialized programming skills. Many encoders cannot take their work to the next level by transforming their collections of structured XML texts into a web searchable and browsable database. Often teams of text encoders are able to encode their texts with a high degree of sophistication, but unless they have funds to hire a programmer, their collections far too often remain on local disk storage away from public access. < + teiPublisher> aims to relieve some of this burden by providing the tools to manage an extensible, modular and configurable XML-based repository which will house, search, browse, and display documents encoded in TEI-Lite on the world wide web. < + teiPublisher> provides an administrative interface that allows DL and DH administrators to upload and delete documents from a web accessible repository; analyze XML documents to determine elements for searching and browsing; refine ontology development; select inter and intra document links; partition the repository into collections; create backups; generate search, browse, and display pages; customize the interface; and associate XSL transformation scripts and CSS stylesheets to obtain different target outputs (HTML, PDF, etc.).

+
+ + + hwp-legacy-fpage + 117 + + + hwp-legacy-dochead + Articles + + +
+
+
\ No newline at end of file diff --git a/xml/parsing-problem.xml b/xml/parsing-problem.xml new file mode 100644 index 0000000..a698fca --- /dev/null +++ b/xml/parsing-problem.xml @@ -0,0 +1 @@ +
\ No newline at end of file diff --git a/xml/wiley-sample-bis.xml b/xml/wiley-sample-bis.xml new file mode 100644 index 0000000..1336211 --- /dev/null +++ b/xml/wiley-sample-bis.xml @@ -0,0 +1,1776 @@ + + +
+ + + Wiley Subscription Services, Inc., A Wiley Company + Hoboken + + 10.1002/(ISSN)1532-2890 + 1532-2882 + 1532-2890 + + + + + Journal of the American Society for Information Science and Technology + J. Am. Soc. Inf. Sci. + + + + Journal of the American Society for Information Science + 0002-8231 + 1097-4571 + 2000 + 51 + 14 + + + + + 10.1002/asi.v62.9 + + 62 + 9 + + September 2011 + + + 10.1002/asi.21591 + + + + + + + + Research Article + Research Articles + + © 2011 ASIS&T + + + + + + + + + + + + + 1696 + 1716 + + + data formats + overlap + markup languages + semantic web + data conversion + + + + + + + + + + + + + A Semantic Web approach to everyday overlapping markup + + + + + Angelo + Di Iorio + + + diiorio@cs.unibo.it + + + + + Silvio + Peroni + + + speroni@cs.unibo.it + + + + + Fabio + Vitali + + + fabio@cs.unibo.it + + + + + + Department of Computer Science, University of Bologna, Bologna, Italy + + + + + Abstract +

Overlapping structures in XML are not symptoms of a misunderstanding of the intrinsic characteristics of a text document nor evidence of extreme scholarly requirements far beyond those needed by the most common XML‐based applications. On the contrary, overlaps have started to appear in a large number of incredibly popular applications hidden under the guise of syntactical tricks to the basic hierarchy of the XML data format. Unfortunately, syntactical tricks have the drawback that the affected structures require complicated workarounds to support even the simplest query or usage. In this article, we present Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), an approach to overlapping markup that simplifies and streamlines the management of multiple hierarchies on the same content, and provides an approach to sophisticated queries and usages over such structures without the need of ad‐hoc applications, simply by using Semantic Web tools and languages. We compare how relevant tasks (e.g., the identification of the contribution of an author in a word processor document) are of some substantial complexity when using the original data format and become more or less trivial when using EARMARK. We finally evaluate positively the memory and disk requirements of EARMARK documents in comparison to Open Office and Microsoft Word XML‐based formats.

+
+
+
+
+ + +
+ Introduction +

The overwhelming consensus among XML practitioners is that documents are trees, the hierarchy is the fundamental data structure, and violations of the hierarchy are errors or unnecessary complications. Therefore, overlapping markup has received ambivalent, almost schizoid considerations in the field of markup languages. Traditionally, overlaps were the hallmarks of bad HTML coders and naïve HTML page editors, taking advantage of the unjustified benevolence in web browsers that would display basically any HTML regardless of proper nesting. At the same time, far from the awareness of the general public, overlaps have been a fringe, almost esoteric, discipline of scholars in the humanities, competently used for arcane specifications of linguistic annotations and literary analysis.

+

Although the first type of overlap was judged with scorn and the second with awe, they both fundamentally represent a situation that is more common than was thought, and the scholars were only more aware, and not more justified, about the need to represent overlaps.

+

Generally, overlap is needed when multiple independent items refer to the same segment, either when considering textual markup documents or multimedia structures (Salembier & Benitez, + 2007). Regarding documents with markup, we need overlap whenever multiple markup elements need to be applied over the same content, and these elements happen to be independent of each other. In some (rather frequent) situations, this independence means that the content referred to by some elements is partially, but not completely, the same as the content referred to by other elements.

+

This situation is more frequent than it may appear: Not only do bad HTML code and arcane linguistic annotations use overlap, but many more mainstream and mundane examples exist. For instance, change tracking in an office document is often at odds with the underlying structure of the text, microformats (Allsopp, + 2007) and Resource Description Framework–in–attributes (RDFa; Adida, Birbeck, McCarron, & Pemberton, + 2008) annotations may need to refer to concepts that span across multiple XML elements, complex data structures (e.g., biological data) force graphs into trees and hide multiple parentage as internal references, and so on.

+

Differently from SGML, which is able to handle some overlapping scenarios through the CONCUR notation (Goldfarb, + 1990), XML grammatically imposes and requires a strict hierarchy of containment generating a single mathematical tree of the document where no overlap is allowed. This requirement has been turned into an intrinsic characteristic of the documents XML was meant to represent rather than a syntactical and conceptual constraint into which these documents need to fit. Thus, whenever authors needed to cope with independent markup elements, they managed either by naïvely ignoring the hierarchical limitation (and therefore creating invalid documents) or by creating careful workarounds within the syntactical constraint, or even by inventing completely new markup languages that allow some types of overlap. But while new multihierarchical markup languages such as TexMecs (Huitfeldt & Sperberg‐McQueen, + 2003) and LMNL (Tennison & Piez, + 2002) have a small number of adepts and applications, and while bad HTML coders and bad HTML page editors are disappearing from the market, the careful workarounds within the XML syntax (TEI Consortium, + 2005), such as segmentation, milestones, or standoff markup, are to this day frequently used and ubiquitous.

+

All workarounds share the same approach of hiding structural information about a secondary hierarchy under the guise of something else: split individual elements, empty boundary elements, indirect references, and so on. The result is that the secondary structural information is hidden or its importance is lessened to not break or obfuscate the main hierarchy expressed in the visible XML structure. But this comes at a price: Structures specified through workarounds are more difficult to find, identify, and act upon than are the structures in the main XML hierarchy. Thus, trivial searches that should amount to a short XPath in a more direct situation end up being multiple‐lines long, pretty basic visualizations require incredibly complex XSLT stylesheets, specific choices of the main markup hierarchy actually prevent some features of the secondary markup to even exist, and so on. So, although workarounds exist and can be used, hierarchies expressed through them are “second‐class citizens” that cannot fully exploit the sophisticated tools that the XML language provides.

+

In this article, we show how Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), our proposal for managing overlapping markup, does not generate first‐ and second‐class hierarchies, and allows existing, sophisticated tools to be used on all markup—even in the presence of overlaps. Rather than creating a completely new language requiring completely new tools and competencies, EARMARK uses Semantic Web technologies and Semantic Web tools to obtain many of the results obtainable with usual XML tools.

+

EARMARK defines markup vocabularies by means of OWL ontologies (W3C OWL Working Group, + 2009). Since each individual markup item is an independent assertion over some content or some other assertions, overlaps of content are not a problem, nor are all the issues connected to physical embedding and containments, such as contiguity and document order. Furthermore, by using standard Semantic Web technologies, fairly sophisticated functionalities can be provided over EARMARK documents.

+

Through EARMARK, operations that were previously very hard or impossible exactly because of the interferences of the multiple hierarchies or of the workarounds they employed now become fundamentally trivial since no syntactical tricks are employed and the different hierarchies do not interfere with each other. Thus, for instance, identifying the individual contributions in a multi‐authored MS Word or Open Office document is quite hard on their original XML formats, and becomes trivial when the same documents are converted into EARMARK.

+

This article is an extended version of previous works on EARMARK (Di Iorio, Peroni, & Vitali, + 2010; Peroni & Vitali, + 2009). In those works, we focused on identifying workarounds for overlapping data existing in real XML documents and translating them into EARMARK assertions. We also sketched the EARMARK ontology and presented a simple implementation of EARMARK‐aware tools. This article follows and extends them and also provides some novel contributions: + + +

The systematic analysis of the EARMARK model, with particular attention to data typing and overlapping structures

+ + +

The discussion of further applications for the ontological EARMARK approach. In particular, we show how EARMARK can be used to improve the content filtering and reversions mechanisms of wikis.

+
+ +

The brief description of a process, called ROCCO, for generating EARMARK documents from existing XML documents (even ones that use workarounds for overlapping structures)

+
+ +

An evaluation of EARMARK efficiency when dealing with multiple hierarchies in comparison with the XML structures used by popular XML‐based formats such as Office Open and MS Word.

+
+ +

+

The article is structured as follows: First, we provide a brief overview of existing approaches to overlap using workarounds in XML or ad hoc markup metalanguages, and then give a few examples of situations where overlaps are used today and sometimes in rather mainstream situations. Next, we present the EARMARK model and its rules. Then, we provide some use cases that are meant to demonstrate the superiority of the EARMARK approach to a traditional XML format, especially when overlaps come into question, and show the generation of EARMARK documents, converting legacy documents. An initial evaluation of the efficiency of EARMARK compared to popular XML based data formats such as Open Document (ODT) and Office Open XML (OOXML) is presented, followed by our conclusions.

+
+
+ Existing Approaches to Overlapping +

The need for multiple overlapping structures over documents using markup syntaxes such as XML and SGML is an age‐old issue, and a large amount of literature exists on the techniques, languages, and tools that allow users to create multiple entangled hierarchies over the same content. A good review can be found in DeRose ( + 2004).

+

Some research has proposed using plain hierarchical markup (i.e., XML) and employing specially tailored elements or attributes to express the semantics of overlapping in an implicit way. The TEI Guidelines (TEI Consortium, + 2005) presented a number of different techniques that use SGML/XML constructs to force multiple hierarchies into a single one, including: + + +

milestones (i.e., overlapping structures are expressed through empty elements to mark the boundaries of the “content”),

+ + +

fragmentation (i.e., overlapping structures are split into individual, nonoverlapping elements that may even be linked through id–idref pairs), and

+
+ +

standoff markup (i.e., overlapping structures are placed elsewhere and indirectly refer to their would‐be locations through pointers, locators, and/or id–idref pairs).

+
+ +

+

Given the large number of techniques to deal with overlapping structures in XML, in Marinelli, Vitali, and Zacchiroli ( + 2008), we presented a number of algorithms to convert XML documents with overlapping structures from and to the most common approaches, as well as a prototype implementation.

+

Riggs ( + 2002) introduced a slightly different technique for fragmentation within XML structures. In this proposal, floating elements (i.e., those elements that do not fall in a proper or meaningful hierarchical order) are created using the name of the element followed by an index referring to its semantically related parent element. For example, the floating element <name.person[2]>John</name.person[2]> means that <name>John</name> is semantically a child of the second occurrence of the element person, even though the floating element is not structurally contained by its logical parent.

+

Other research even has proposed to get rid of the theory of trees at the base of XML/SGML altogether and use different underlying models and newly invented XML‐like languages that allow the expression of overlaps through some kind of syntactical flourishing. For instance, a general ordered‐descendant directed acyclic graph (GODDAG; Sperberg‐McQueen & Huitfeldt, + 2004) is a family of graph‐theoretical data structures to handle overlapping markup. A GODDAG's nodes represent markup elements and text. Arcs are used to explicitly represent containment and father–child relations. Since multiple arcs can be directed to the same node, overlapping structures can be straightforwardly represented in GODDAG. Full GODDAGs cannot be linearized in any form using embedded markup, but restricted GODDAGs, a subset thereof, can be and have been linearized into TexMecs (Huitfeldt & Sperberg‐McQueen, + 2003), a multihierarchical markup language that also allows full GODDAGs through appropriate nonembedding workarounds such as standoff markup.

+

LMNL (Tennison & Piez, + 2002) is a general data model based on the idea of layered text fragments and ranges, where multiple types of overlap can be modeled using concepts drawn from the mathematical theory of intervals. Multiple serializations of LMNL exist, such as CLIX and LMNL‐syntax.

+

XConcur (Schonefeld & Witt, + 2006) is a similar solution based on the representation of multiple hierarchies within the same document through layers. Strictly related to its predecessor CONCUR as it was included in the SGML, XConcur was developed in conjunction with the validation language XConcur‐CL to handle relationships and constraints between multiple hierarchies.

+

The variant graph approach (Schmidt & Colomb, + 2009) also is based on graph theory. Developed to deal with textual variations that generate multiple versions of the same document with multiple overlapping hierarchies, this theory proposes a new data model to represent literary documents and a graph linearization (based on lists) that scales well even with a large number of versions. The same authors recently presented an extension of their theory that also allows users to merge multiple variants into one document (Schmidt, + 2009). Portier and Calabretto ( + 2009) presented a detailed survey about overlapping approaches and also discussed the MultiX2 data model, which uses W3C standard languages such as XInclude to link and fetch text fragments within overlapping structures, and a prototype editor for the creation of multistructured documents.

+

Tummarello, Morbidoni, and Pierazzo ( + 2005) proposed using RDF as a standoff notation for overlapping structures of XML documents. Since this proposal has many affinities with the one we are presenting in this article, we later discuss its characteristics and compare it with ours.

+
+
+ More Frequent Than One May Think: Overlapping in the Wild +

Overlapping structures have been considered often as appropriate only in highly specific contexts and basically for scholars: The solutions that have been proposed in the literature were complex since they were considered grounded in the intrinsic complexity of the topics themselves. Yet, overlapping structures can be found in many more fields than these, and even mainstream applications generate and use markup with overlapping structures. While the complexity of overlapping is hidden to the final user, applications that consume such data may very well find it rather difficult to handle such information. We next discuss three very different contexts where overlapping already exists and fairly relevant information is encoded in multiple independent structures, leaving to special code the task of managing the complexity.

+
+ Change Tracking in Office Document Formats +

Word processors such as Microsoft Word and Open Office provide users with powerful tools for tracking changes, allowing each individual modification by individual authors to be identified, highlighted, and acted upon (e.g., by accepting or discarding them). The intuitiveness of the relevant interfaces actually hides the complexity of the data format and of the algorithms necessary to handle such information.

+

For instance, the standard ODT format (JTC1/SC34 WG 6, + 2006) used by Open Office, when saving change‐tracking information, relies on two specific constructs for insertions and deletions that may overlap with the structural markup. Adding a few words within a paragraph is not in itself complex, as it does not require the breaking of the fundamental structural hierarchy; conversely, changes that affect the structure itself (e.g., the split of one paragraph in two by the insertion of a return character, or the joining of two paragraphs by the elimination of the intermediate return character) require that annotations are associated to the end of a paragraph and the beginning of the next, in an unavoidably overlapping pattern. ODT uses milestones and standoff markup for insertions and deletions, respectively, and also relies on standoff markup for annotations about the authorship and date of the change.

+

For instance, the insertion of a return character and a few characters in a paragraph creates a structure as follows:

+

+ + + + + + +

+

The empty elements <text:change‐start/> and <text:change‐end/> are milestones marking the beginning and the end, respectively, of the range that constituted the insertion while the element <text:insertion>, before the beginning of the document content, is standoff markup for the metadata about the change (author and date information).

+

Similarly, a deletion creates a structure as follows:

+

+ + + + + + +

+

The element <text:change/> represents a milestone of the location where the deletion took place in the content, and the corresponding standoff markup annotation <text:deletion> contains not only the metadata about the change but also the text that was deleted.

+

The OOXML format (JTC1/SC34 WG 4, + 2008) (the XML‐based format used by Microsoft Office 2007 and standardized by ISO in 2008), on the other hand, uses a form of segmentation to store change‐tracking information across all previous elements involved.

+

+ + + + + + +

+

This heavily simplified version of an OOXML document shows two separate changes: (a) the insertion of a return character and (b) the insertion of a word. These modifications are not considered as a single change; therefore, the segments are not connected to each other but simply created as needed to fit the underlying structure.

+

In fact, change tracking in OOXML is a fairly complex proposition. Although providing more complete coverage of special cases and situations than does ODT, dealing with its intricacies is not for the casual programmer. Even a simple XSLT stylesheet to show inserted text in a different color and hide deleted text may run several hundred lines of code. + 1 + +

+ http://OOXMLdeveloper.org/archive/2006/09/07/625.aspx +

+ +

+
+
+ Overlapping With Microformats +

Microformats (Allsopp, + 2007) add semantic markup to web documents by using common structures of the HTML language itself—in particular, the class attribute.

+

The HTML code is annotated using microformats to provide new semantic, machine‐processable assertions. In the following example, a plain HTML table is enriched with metadata about events + 2 + +

HCalendar, + http://microformats.org/wiki/hcalendar +

+ + and people: + 3 + +

HCard, + http://microformats.org/wiki/hcard +

+
+

+

+ + + + + + +

+

The table was enriched by additional data declaring it to be an event (a conference), and data about the event itself (URL, summary, location) and about four relevant individuals (with their names and roles within the conference) were associated where necessary to the actual content of the table.

+

So far, so good, and no overlap to speak about. Things change dramatically, though, when the overall structure of the main hierarchy (the HTML table) is at odds with the intrinsic hierarchy of the microformat data, such as if the people are organized in columns rather than rows. For instance:

+

+ + + + + + +

+

Unfortunately, vcards are a hierarchy themselves, and if the hierarchy of vcards is organized differently from the hierarchy of the HTML table, as in the latter case, it is just impossible to define the four vcards for the four people organizing the conference. Thus, in plain HTML, the choice of one of two possible presentation models for the main hierarchy of content makes trivial or completely impossible the existence of the second hierarchy.

+

A possible and partial solution to express vcard hierarchies in the latter example is RDFa (Adida et al., + 2008), a W3C recommendation. It describes a mechanism to embed RDF statements into HTML documents by using some HTML attributes (href, rel, rev, content) in combination with other ad hoc attributes (property, about, typeof) proposed in the recommendation itself.

+

+ + + + + + +

+

Since all attributes live in the context of elements, the price to pay is that to assert everything we want to assert, we often need to add some structurally unnecessary elements to the current markup hierarchy of a document, needed only to add the RDF statements (e.g., the span elements emphasized earlier). Even if that does not represent a significant problem for strict Semantic Web theorists, document architects and markup experts see this as a kludge and an inelegant compromise.

+
+
+
+ Wikis: No Overlapping Where Some Should Be +

The strength of wikis lies in their allowing users to modify content at any time. The mechanisms of change‐tracking and rollback that are characteristics of all wikis, in fact, promote users' contributions and make “malicious attacks” pointless in the long run since previous versions can be easily restored.

+

A number of tools exist that automatically discover “wiki vandalisms” and provide users with powerful interfaces to surf changes, identify differences between subsequent versions, and revert content. For instance, Huggle + 4 + +

+ http://en.wikipedia.org/wiki/Wikipedia:Huggle +

+ + is an application dealing with vandalism in Wikipedia, based on a proxy architecture and .NET technologies. A straightforward interface allows users to access any version of a page, highlights contributions of a specific user, and reverts the content to old versions.

+

Even client‐side tools—meant to be installed as browser extensions or bookmarklets—exist to extend the rollback mechanisms of Wikipedia, giving users more flexibility and control over (vandalistic) changes. For instance, Lupin + 5 + +

+ http://en.wikipedia.org/wiki/User:Lupin/Anti‐vandal_tool +

+ + is a set of javascript scripts that check a wiki page against a list of forbidden terms so that authors can identify undesirable modifications and restore previous (i.e., good) versions without a continuous control over the full content of the page; yet again, Twinkle + 6 + +

+ http://en.wikipedia.org/wiki/Wikipedia:Twinkle +

+
+ provides users powerful rollback functions and includes a full library of batch deletion functions, automatic reporting of vandals, and user notification functions.

+

These tools are successful in highlighting vandalism and in identifying versions created by malicious users. However, although it is possible to revert the page to any previous version, all changes (even acceptable ones) that were subsequent to the malicious version cannot be automatically inherited by the restored page.

+

For instance, consider Versions V1, V2, and V3 of a wiki page, where Version V1 contains a baseline (i.e., acceptable) content, and Version V2 is identified as a partial vandalism and is agreed to be removed, but Version V3 contains (possibly, in a completely different section than the target of the malicious attack) relevant and useful content that was added before the vandalistic Version V2 was declared as such. The task of removing the modifications of Version V2 while maintaining (whatever is possible of) Version V3 is a difficult, error‐prone, and time‐consuming task if done manually, yet there is no tool we are aware of that automatically filters contributions from multiple versions and merges them into a new one (or, equivalently, removes only selected intermediate versions).

+

However, it is possible to theoretically characterize the interdependencies between subsequent changes to a document. In fact, literature has existed for a long time on exactly these themes (e.g., Durand, + 1994, + 2008). Although a detailed discussion of abstract models of interconnected changes is out of scope for this article (details and authoritative references can be found in the aforementioned works), what is relevant in this discussion is that they happen to assume a hierarchical form that is frequently at odds with the hierarchical structure of the content of the document, and as such, most issues derive from the data structures in which content is stored and from the model for manipulating these structures. For instance, the fact that in the wiki perspective each version is an independent unit that shares no content (even unchanged content) with the other versions prevents considering multiple versions as overlapping structures coexisting on the same document. If we were able to make these hierarchies explicit, we would be able to create models and tools to manipulate these documents in a more powerful way and to exploit the existing interconnections between the overlapping hierarchies.

+
+
+ Introduction to EARMARK and Its Support for Overlapping Features +

The presence of hidden overlapping structures—transparent to users, but very difficult to handle by applications—is the common denominator for the scenarios described in the previous section. More than the overlap itself, which cannot be ignored because it does exist and carries important meanings, the problem we face lies in the way applications store such overlapping structures. In the XML world, in fact, the only way to do so is through the use of (complex) workarounds that force the multiple hierarchies into one hierarchy of an XML document. That makes it very tricky to perform sophisticated analysis and searches.

+

This section discusses a different approach to metamarkup, EARMARK (Di Iorio, Peroni, & Vitali, + 2009; Di Iorio et al., + 2010; Peroni & Vitali, + 2009) based on ontologies and Semantic Web technologies. The basic idea is to model EARMARK documents as collections of addressable text fragments, and to associate such text content with OWL assertions that describe structural features as well as semantic properties of (parts of) that content. As a result, EARMARK allows not only documents with single hierarchies (as with XML) but also multiple overlapping hierarchies where the textual content within the markup items belongs to some hierarchies, but not to others. Moreover, EARMARK makes it possible to add semantic annotations to the content through assertions that may overlap with existing ones.

+

One of the advantages of using EARMARK is the capability to access and query documents by using well‐known and widely supported tools for the Semantic Web. In fact, EARMARK assertions are simply RDF assertions while EARMARK documents are modeled through OWL ontologies. The consequence is that query languages (e.g., SPARQL; Garlik & Seaborne, + 2010) and actual existing tools such as Jena + 7 + +

+ http://jena.sourceforge.net +

+ + and Pellet + 8 + +

+ http://pellet.owldl.com +

+
+ can be directly used to deal with even incredibly complicated overlapping structures. What is very difficult (or impossible) to do with traditional XML technologies becomes much easier with these technologies under the EARMARK approach.

+

In the rest of this section, we give a brief overview of the EARMARK model and then describe how EARMARK can be used to deal with the issues presented earlier. The model itself is defined through an OWL document, + 9 + +

+ http://www.essepuntato.it/2008/12/earmark +

+ + summarized in Figure + 1, specifying classes and relationships. We distinguish between ghost classes, which define the general model, and shell classes, which are actually used to create EARMARK instances.

+
+ + + + + + + + +

A UML‐like representation of the EARMARK ontology. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+
+ Ghost Classes +

The ghost classes describe three disjoint base concepts—docuverses, ranges, and markup items—through three different and disjoint OWL classes. + 10 + +

All our OWL samples are presented using the Manchester Syntax (Horridge & Patel‐Schneider, + 2009), which is one of the standard linearization syntaxes of OWL. The prefixes rdfs and xsd refer to RDF Schema and XML Schema namespaces, respectively, while the empty prefix refers to the EARMARK ontology URI plus “#.” Moreover, we use the prefix c to indicate entities taken from an imported ontology made for the SWAN project (Ciccarese et al., + 2008); available at + http://swan.mindinformatics.org/spec/1.2/collections.html +

+ +

+

The textual content of an EARMARK document is conceptually separated from its annotations, and is referred to through the Docuverse class. + 11 + +

This class (and its name) is based on the concept introduced by Ted Nelson ( + 1980) in his Xanadu Project to refer to the collection of text fragments that can be interconnected to each other and transcluded into new documents.

+ + The individuals of this class represent the object of discourse (i.e., all the containers of text of an EARMARK document).

+

+ + + + + + +

+

Any individual of the Docuverse class—commonly called a docuverse (lowercase to distinguish it from the class)—specifies its actual content with the property hasContent.

+

We then define the class Range for any text lying between two locations of a docuverse. A range (i.e., an individual of the class Range) is defined by a starting and an ending location (any literal) of a specific docuverse through the properties begins, ends, and refersTo, respectively.

+

+ + + + + + +

+

There is no restriction on locations used for the begins and ends properties. That is very useful because it allows us to define ranges that follow or reverse the text order of the docuverse to which they refer. For instance, the string “desserts” can be considered both in document order, with the begins location lower than the ends location, or in the opposite order, forming “stressed.” + 12 + +

+ http://en.wikipedia.org/wiki/Palindrome#Semordnilaps +

+ + Thus, the values of the properties begins and ends define the way a range must be read.

+

The class MarkupItem is the superclass defining artifacts to be interpreted as markup (e.g., elements and attributes).

+

+ + + + + + +

+

A markupitem individual is a collection (c:Set, c:Bag, or c:List, where the latter is a subclass of the second one, and all of them are subclasses of c:Collection) of individuals belonging to the classes MarkupItem and Range. Through these collections, it is possible to define a markup item as a set, a bag, or a list of other markup items, using the properties element (for sets) and item and itemContent (for bags and lists). Thus, it becomes possible to define elements containing nested elements or text, or attributes containing values, as well as overlapping and complex structures. Note also that handling collections directly in OWL allows us to reason about content models for markup items, which would not be possible if we had used the corresponding constructs in RDF. + 13 + +

+ http://hcklab.blogspot.com/2008/12/moving‐towards‐swan‐collections.html +

+ +

+

A markupitem also might have a name, specified in the functional property hasGeneralIdentifier (recalling the SGML term to refer to the name of elements; Goldfarb, + 1990), and a namespace, specified using the functional property hasNamespace. Note that we can have anonymous markup items—as is possible in LMNL (Tennison & Piez, + 2002) and in GODDAG (Sperberg‐McQueen & Huitfeldt, + 2004)—by simply asserting that the item belongs to the class of all those markupitems that do not have a general identifier (i.e., hasGeneralIdentifier exactly 0).

+
+
+ Shell Classes +

The ghost classes discussed so far give us an abstract picture of the EARMARK framework. We need to specialize our model, defining a concrete description of our classes. These new shell subclasses apply specific restrictions to the ghost classes.

+

First, the class Docuverse is restricted to be either a StringDocuverse (i.e., the content is specified by a string) or a URIDocuverse (i.e., the actual content is located at the URI specified).

+

+ + + + + + +

+

Depending on particular scenarios or on the kind of docuverse we are dealing with (plain‐text, XML, LaTeX, a picture, etc.), we need to use different kinds of ranges. Therefore, the class Range has three different subclasses: + + +

PointerRange defines a range by counting characters. In that case, the value of the properties begins and ends must be a nonnegative integer that identifies unambiguous positions in the character stream, remembering that the value 0 refers to the location immediately before the first character, the value 1 refers to the location after the first character and before the second one, and so on. By using the hasKey OWL property, we also assert that two pointer ranges having equal docuverse and begin and end locations are the same range.

+ + +

XPathRange defines a range considering the whole docuverse or its particular context specifiable through an XPath expression (Berglund et al., + 2007) as value of the property hasXPathContext. Note that by using these ranges, we implicitly admit that the docuverse it refers to must be an XML structure. Moreover, the properties begins and ends have to be applied on the string value obtained by juxtaposing all the text nodes identified by the XPath. By using the hasKey OWL property, we also assert that two xpath ranges having equal docuverse, XPath context, and begin and end locations are the same range.

+
+ +

XPathPointerRange is an XPathRange in which the value of the properties begins and ends must be a nonnegative integer that identifies unambiguous positions in the character stream as described for the class PointerRange.

+
+ +

+

+ + + + + + +

+

MarkupItem is specialized in three disjoint subclasses—Element, Attribute, and Comment—that allow a more precise characterization of markup items.

+

+ + + + + + +

+
+
+ Range and Markup Item Overlap +

The presence of overlap in EARMARK is worth discussing in more detail. Different types of overlap exist, according to the subset of items involved, and different strategies are needed to detect them. In particular, there is a clear distinction between overlapping ranges and overlapping markup items.

+

By definition, overlapping ranges are two ranges that refer to the same docuverse, so that at least one of the locations of the first range is contained in the interval described by the locations of the second range (excluding its terminal points). Totally overlapping ranges have the locations of the first range completely contained in the interval of the second range, or vice versa, while partially overlapping ranges have either exactly one location inside the interval and the other outside or identical terminal points in reversed roles.

+

Thus, if we consider the following excerpt:

+

+ + + + + + +

+

we can infer, through a reasoner such as Pellet, that these two ranges overlap by using the following rules:

+

+ + + + + + +

+

where P is one of: + + +

lessThan(b1,e1) ∧ greaterThan(b2,b1) ∧ lessThan(b2,e1)

+ + +

lessThan(b1,e1) ∧ greaterThan(e2,b1) ∧ lessThan(e2,e1)

+
+ +

lessThan(e1,b1) ∧ greaterThan(b2,e1) ∧ lessThan(b2,b1)

+
+ +

lessThan(e1,b1) ∧ greaterThan(e2,e1) ∧ lessThan(e2,b1).

+
+ +

+

The case of overlapping markup items is slightly more complicated. We define that two markup items A and B overlap when at least one of the following sentences holds: + + +

[Overlap by range]: A contains a range that overlaps with another range contained by B.

+ + +

[Overlap by content hierarchy]: A and B contain at least a range in common.

+
+ +

[Overlap by markup hierarchy]: A and B contain at least a markup item in common.

+
+ +

+

The three possible scenarios for such item overlap are summarized in Figure + 2. + 14 + +

The EARMARK documents describing these three overlapping scenarios and all the other ones presented in the following sections are available at + http://www.essepuntato.it/2011/jasist/examples +

+ +

+
+ + + + + + + + +

Three EARMARK examples of overlapping between elements p. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The EARMARK ontology, in fact, is completed by another ontology + 15 + +

+ http://www.essepuntato.it/2011/05/overlapping +

+ + that models all overlapping scenarios, either for ranges or markup items, and includes rules for automatically inferring overlaps through a reasoner.

+
+
+ EARMARK as a Standoff Notation +

If we ignore for a moment the semantic implications of using EARMARK and concentrate on its syntactical aspects only, it is easy to observe that EARMARK is nothing but yet another standoff notation, where the markup specifications point to, rather than contain, the relevant substructure and text fragments.

+

Standoff notations, also known in literature as out‐of‐line notations (TEI Consortium, + 2005), are hardly new, but never really caught on for a number of reasons, most having to do with their perceived fragility under the circumstances of desynchronized modification to the text. In Georg, Schonefeld, Trippel, and Witt ( + 2010) and Bański ( + 2010), we can find a pair of recent and substantially complete analyses of their merits and demerits. In particular, according to Georg, Schonefeld, Trippel, and Witt ( + 2010), “standoff annotation has … quite a few disadvantages: + + +

very difficult to read for humans

+ + +

The information, although included, is difficult to access using generic methods.

+
+ +

Limited software support as standard parsing or editing software cannot be employed.

+
+ +

Standard document grammars can be used only for the level which contains both markup and textual data.

+
+ +

New layers require a separate interpretation.

+
+ +

Layers, although separate, often depend on each other.” + 16 + +

To individually address the issues, we edited the original bullets into a numbered list.

+ +

+
+ +

+

And yet, although EARMARK is in practice a standoff notation, it provides a number of workarounds to most of the aforementioned issues.

+

First, since EARMARK is based on OWL and can be linearized in any of the large number of OWL linearization syntaxes, it follows that (a) readability, (b) access, and (c) software support for it are exactly those existing for well‐known, widespread, and important W3C standards such as RDF and OWL. Being able to employ common RDF and OWL tools such as Jena and SPARQL for EARMARK documents was in fact a major motivation for it.

+

Issue 4 should be examined beyond the mere validation against document grammars and toward a general evaluation of the level of compliancy of the markup to some formally specified expectations. EARMARK documents, while being subject to no document grammar in the stricter XML sense, allow the specification of any number of constraints, expressed either directly in OWL or SWRL (Horrocks et al., + 2004), or even in SPARQL, that trigger or generate validity evaluations. In Di Iorio, Peroni, and Vitali (in press), we tried to show that a large number of requirements, from hierarchical well‐formedness in the XML sense, to validation requirements in terms of XML DTDs, to adherence to design patterns, can be expressed satisfactorily using these technologies.

+

Issue 5 regards the difficulty of standoff notations to provide interlayer analysis on XML structures: Separate interpretation of markup layers is easy, but identification and validation of overlapping situations are more complex: Standoff markup is mainly composed of pointers to content and does not have any direct way to determine overlap locations without some kind of pointer arithmetics to compute them. Validation of contexts allowing overlaps as describable using rabbit/duck grammars (Sperberg‐McQueen, + 2006) also is not trivial. In this regard, EARMARK provides yet again a solution that does not require special tools: Although OWL does not allow direct pointer arithmetics, SWRL on the contrary does, as shown earlier where we described a batch of (SWRL‐implementable) rules that do, in fact, determine overlapping locations on EARMARK documents with good efficiency.

+

Finally, Issue 6 refers to the fact that evolution of separate markup annotation layers needs to synchronously take place, lest one of them becomes misaligned with the new state of the document. This is, in summary, the fragility of pointers, which can be considered the fundamental weakness of standoff, as well as of any notation that has markup separate from its content: If a modification occurs to the underlying (probably text‐based) source, all standoff pointers that could not be updated at the same time as the change become outdated and possibly wrong. All standoff notations fall prey to this weakness, and there is no way to completely get rid of it.

+

What is possible is to identify exactly the conditions under which such weakness acts, and see if there is a way to reduce the mere frequency of such events. In fact, for a standoff pointer to become outdated, several conditions must take place at the same time: + + +

The standoff notation must be used as a storage format, rather than just as a processing format;

+ + +

the source document must make sense even without the additional standoff markup (i.e., the standoff notation contains no information that is necessary for at least some types of document modifications);

+
+ +

the source document must be editable (and, in fact, must be edited) on its own;

+
+ +

the standoff pointers must rely on positions that change when the source is edited (e.g., character‐based locations);

+
+ +

editing must be done in contexts and with tools that cannot or do not update the standoff pointers; and

+
+ +

there must be no computable way to determine the modifications of the document (e.g., via a diff between the old and new versions).

+
+ +

+

Of course, no standoff notation can rule out that these conditions occur in its documents, but note that all six of them must occur for standoff pointers to become outdated. EARMARK is not safe from these occurrences either, but at least for the use cases here described, one or more of these conditions simply do not apply: EARMARK is mostly used as a processing format, with no need to save it on disk (conversion from source formats such as MS Word is described later and does not require special storage); and the source data either are in a very specific format (e.g., MS Word or ODT) that in fact already handles its data changes internally and requires the overlapping data exactly for this purpose, or are in fact the result of a diff action on successive versions of a document (as in the case of the wiki pages). Finally, EARMARK allows references to relatively stable fragment ids of the documents (by using XPath ranges without specifying explicitly begin and end locations) rather than the extremely fragile character locations, further reducing the chances of outdated pointers. For this reason, without being able to completely rule out the possibility of standoff pointers going wrong, we tend to consider it a very small risk, at least for the use case described here.

+
+
+ Using OWL Versus RDF for Standoff Notations +

EARMARK is strongly based on OWL 2 DL (W3C OWL Working Group, + 2009) to express multiple markup layers with possible overlapping ranges over the same content. OWL 2 DL is not the only possible choice for expressing standoff notations via Semantic Web technologies. In fact, RDF is another valid and effective model for dealing with the same issue, as shown in Tummarello et al. ( + 2005), by means of the open‐source application programming interface (API) RDF Textual Encoding Framework (RDFTef). This API was created to demonstrate a plausible way for handling overlapping markup within documents and identifying textual content of a document as a set of independent RDF resources that can be linked mutually and with other parent resources.

+

Besides giving the possibility to define multiple structural markup hierarchies over the same text content, the use of RDF as the language for encoding markup allows one to specify semantic data on textual content as well. But the real main advantage in using RDF is the possibility of using particular built‐in resources appositely defined in the RDF syntax specification (Beckett, + 2004) for describing and dealing with different kinds of containers, either ordered (rdf:Seq) or unordered (rdf:Bag). Thus, RDF resources can be used to represent every printable element in the text—words, punctuation, characters, typographical symbols, and so on—while RDF containers also can be used to combine such fragments and containers.

+

RDF alone, however, is not sufficient to define a formal vocabulary for structural markup: Does a given resource represent an element, an attribute, a comment, or a text node? In which way is a resource of a certain type related to others? The specification of an RDFS (Brickley & Guha, + 2004) or of an OWL layer can successfully address these issues. Hybrid solutions obtained by mixing different models, even when they are built one upon another, may seem elegant, but are not necessarily the best choice. In fact, there exist well‐known interoperability limits between OWL 2 DL and RDF that prevent the correct use of Semantic Web tools and technologies. In particular: + + +

Any markup document made using RDF containers (e.g., to describe what markup items contain and in which order) and OWL ontologies (e.g., to define classes of markup entities and their semantics) results in a set of axioms that end up outside of OWL DL and well within OWL Full, which limits the applicability of the most frequently used Semantic Web tools that are usually built upon the (computationally tractable) description logic underlying OWL 2 DL.

+ + +

The individual analysis of each language may not be applicable when we have to check particular properties that lie between RDF and OWL layers. For example, verifying the validity of a markup document against a particular schema, which is one of the most common activities with markup, needs to be made to work with both markup item structures (that would be defined in RDF) and logical constraints about classes of markup items (e.g., elements only, attributes only, the element “p,” all the elements of a particular namespace, etc., all of them definable in OWL).

+
+ +

+

Being able to express everything we need directly in OWL quite straightforwardly addresses both issues. The well‐known absence of containers and sequences in OWL can be overcome by modeling classes in specific ways using specific design patterns such as those in Ciccarese et al. ( + 2008) and in Drummond et al. ( + 2006).

+
+
+
+ Using EARMARK +

There are multiple applications for the EARMARK approach. The most interesting for this article is its capability of dealing with overlapping structures in an elegant and straightforward manner. Under EARMARK, such structures do not need to be specified through complex workarounds as with XML, but they are explicit and can be easily described and accessed. Sophisticated searches and content manipulations become very simple when using this ontological model.

+

The goal of this section is to demonstrate the soundness and applicability of EARMARK by discussing how the use cases presented earlier are addressed. Note that throughout the section we investigate multiple EARMARK data structures and documents, focusing on the feasibility and potentiality of such an ontological representation.

+
+ Looking for Authorial Changes in Office Documents +

The discussion about change tracking in office document formats showed that both ODT (OpenOffice format) and OOXML (Microsoft Word format) use complex data structures to store overlaps generated by change‐tracking functionalities. These structures make it very difficult to search and manipulate the content when using XML languages and tools. Even very simple edits generate a rather tangled set of overlapping elements.

+

Let us recall the example mentioned earlier where the user “John Smith” splits a single paragraph into two. The ODT representation is:

+

+ + + + + + +

+

The OOXML representation (shown earlier) is even more complex. In fact, these formats exploit in large scale (tangled) fragmentation (OOXML) or milestones and stand‐off markup (ODT) to deal with overlaps.

+

EARMARK, on the other hand, stores overlapping data in a direct and streamlined manner that does not require tools to rebuild information from the twists of a tree‐based XML structure. The information already is available and expressed through consistent RDF and OWL statements. Figure + 3 graphically shows the corresponding EARMARK document.

+
+ + + + + + + + +

Encoding in EARMARK the ODT change‐tracking example. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The original paragraph content and the new string “also” are now encoded as two docuverses over which the ranges r1, r2, and r3 are defined. The original paragraph is then composed of the (content of) ranges r1 and r2 while the paragraphs resulting after the (text and carriage return) insertion now comprise range r1 and ranges r2 and r3, respectively. Metadata about the author and the modification date are encoded as further RDF statements.

+

+ + + + + + +

+

The advantages of streamlining overlaps become apparent if we consider tasks a little beyond the mere display. For instance, the query for “the textual content of all paragraphs inserted by John Smith” ends up rather entangled if we used XPath on the ODT structure. The process for finding that textual content needs to browse the current version of the document and look for all the text:change‐start/text:change‐end pairs that refer to an insertion made by John Smith involving the creation of a new paragraph (i.e., text:change‐start is in a first paragraph while its pair, text:change‐end, is in the following one) that are either currently present in the document body or hidden behind a subsequent deletion made by someone else. Once the paragraphs are identified, we need to retrieve the content that originally was contained there (i.e., the text fragments that still are within those boundaries or that may have been deleted in subsequent versions). The following XPath represents an implementation of this process:

+

+ + + + + + +

+

The XML structure of an MS Word file, using segmentation rather than milestones, does simplify the query a bit, but still presents some radical complexities. The process starts by choosing all those w:p elements that were inserted by John Smith as well as all their previous and contiguous w:p elements that were deleted before or inserted after the first ones. In OOXML, each sequence of contiguous w:p elements implicitly represents one paragraph. Therefore, we can now take all the text fragments contained in each w:p sequence that were inserted before or deleted after the paragraph defined by the sequence itself. The following is the resulting XPath for an OOXML document.

+

+ + + + + + +

+

The complexity of both XPath queries is due to the intrinsic complexity of the data structure on which the query has to work. Although the interface of OpenOffice or MS Word may provide tools to directly deal with these queries using specific strategies on the internal data structures, applications working directly on the XML structure have very little help in disentangling the mess of the data formats.

+

On the other hand, since EARMARK documents are actually OWL files, it is possible to access and query them with plain Semantic Web tools. Powerful searches then can be performed without using niche‐specific tools or complex and long XPath expressions simply with mainstream technologies such as SPARQL (Garlik & Seaborne, + 2010).

+

The corresponding SPARQL query for (“the textual content of all paragraphs inserted by John Smith”) therefore can be written as follows:

+

+ + + + + + +

+

But EARMARK is useful for even more than querying: EARMARK also decreases the costs, in terms of efforts and lines of code, for manipulating documents.

+

Let us consider the task of generating an intermediate version (i.e., neither the first nor the last one of a version chain) from a document that includes change‐tracking information about the whole document history.

+

The process of rebuilding these versions by working on the XML structure without specific APIs is both complex and inefficient. For example, a basic XSLT that returns an XML document defining the desired version must at least: + + +

define templates for all the elements actively involved in the change tracking (e.g., for ODT, text:changed‐region, text:change‐start, text:change‐end, and text:change and similarly for OOXML) to understand, by looking at their creation date, whether they must be considered or ignored when building the requested version. In particular, we must exclude insertions following and deletions preceding the version we are building;

+ + +

define templates for paragraphs to handle cases where the paragraph is the result of an insertion or a deletion of other paragraphs to identify whether it should be considered for the result and, in such case, finding out its real text content and remembering that in the following versions, such content may have spread out among other paragraphs;

+
+ +

define templates for handling insertions/deletions for structures such as images, sections, lists, and tables; and

+
+ +

define an identity template for the other elements to visit the entire document.

+
+ +

+

Even the most basic and incomplete implementation of such XSLT requires hundreds of lines of complex and convoluted code and a large number of ad hoc decisions based on the specificities of whether we start from ODT or OOXML. Note also that a Java‐based implementation (or in any other procedural language) of the same process would be equally or even more complex.

+

The same result can be achieved on EARMARK documents with a few lines of Java code:

+

+ + + + + + +

+

This approach uses the EARMARK Java API + 17 + +

+ http://earmark.sourceforge.net +

+ + and a single SPARQL query, runnable on any SPARQL 1.1 processor such as Jena, to identify the root node of the subtree of the version that is associated with the specified date and creator. Then, it performs a simple, recursive, depth‐first visit to clone all the nodes in the tree and to combine them in the output EARMARK document.

+

This method heavily uses Semantic Web technologies on the structures provided by EARMARK whose characteristics are always explicit and clear. In fact, since all versions coexist within the EARMARK document and each version can be encoded explicitly as a tree within the overall graph, this operation is straightforward and fast.

+
+
+ Improving Semantic Annotations +

EARMARK also can be exploited to improve semantic annotations. As noted earlier, there are in fact strong limitations in the same process of annotating web documents with semantic structures that overlap the structural ones. The same example—of vcards that cannot be created on the top of tables organized per rows—will be used in this section.

+

We solve this by converting the web document with annotations into an EARMARK document, allowing both semantic and structural annotations to coexist. Through EARMARK, we can explicitly express both markup structures and vcard assertions. Figure + 4 shows how the vcard example can be modeled. (Once again, we show a graphical representation for the sake of clarity.)

+
+ + + + + + + + +

The abstract model of the EARMARK document solving the microformats issue. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The textual content of the original table cells is now encoded in two different docuverses: one for the header (with roles) and one for the body (with names of committee members). Ranges r1, r2, …, r8 are then created to distinguish each role and name. Two independent and coexisting hierarchies are then built on top of the same set of ranges: the HTML table that includes one cell for each range (in blue) and the vcards about each person (in green) that include only the relevant ranges and overlap the previous one. Note also that the vcards are defined in such a way that does not interfere with the structural features of the table. The full linearization in OWL of this example can be found at + http://www.essepuntato.it/2011/jasist/examples +

+
+
+ Improving Wiki Content Reversions +

EARMARK can be used to improve wiki reversion mechanisms and overcome the limitations discussed earlier: The automatic filtering and merging of contributions from multiple versions of the same page are still a manual process, but it can be fully automatized if the overlapping structures buried in the whole history of the page become explicit.

+

The role of EARMARK is to make those structures explicit and available for more sophisticated content manipulation. To understand the extent EARMARK structures can be derived from wikis and how they can be exploited by the final users, we use as our example the wiki platform MediaWiki + 18 + +

+ http://www.mediawiki.org +

+ + (i.e., the wiki engine of Wikipedia).

+

MediaWiki offers sophisticated functionalities for creating diffs of wiki content. Users can compare any two revisions in the page history and highlight changes in a friendly interface that shows modifications with a word‐level granularity. Diff pages contain metadata about each compared version (when the version was created, who the author was, or which IP address an anonymous author was connected from, etc.) and a two‐column table showing the changes side by side. Changes are detected a posteriori by comparing two arbitrary versions, which are not even required to be temporally contiguous.

+

The output of the MediaWiki diff engine has regularities that can be exploited to automatically build the overlapping structures of the diff and to express them in EARMARK. Let us consider a fictitious example summarized in Table + 1, where an initial text is revised three times by different authors.

+ + + All the versions of a wiki page modified by different authors. + + + + + + + + + + Version Author + V1 151.61.3.122 + V2 Angelo Di Iorio + V3 Silvio Peroni + V4 Fabio Vitali + + + + + Content + Bob was farming carrots and tomatoes. + Bob was farming carrots, tomatoes and beans. + Bob was farming carrots, tomatoes and green beans. They were all tasteful. + Bob was farming carrots, tomatoes and green beans. [new paragraph] They were all tasteful. + + + +
+
+

To display the differences between V1 and V2, MediaWiki creates a page whose HTML code is as follows: + 19 + +

For the sake of clarity, we removed all markup irrelevant to our discussion.

+ +

+

+ + + + + + +

+

This is an HTML table of two rows, the first showing metadata (date and author of the modification), and the second showing the actual modifications. The first cell of the second row contains all the unmodified text and a del element for each inline fragment that was deleted. The second cell contains all the unmodified text and an ins element for each inline fragment that was inserted. Thus, these cells share exactly the same unmodified part(s) of the two compared versions.

+

When the structure itself is modified rather than merely the text, the source code of the MediaWiki diff is slightly different. Thus, the diff between V3 and V4 (which splits a paragraph in two) is as follows:

+

+ + + + + + +

+

The diff output is not complete or sophisticated, and of course, it is a completely different task to replan such an algorithm (but for a first idea of natural changes in diffing XML documents, see Di Iorio, Marchetti, Schirinzi, & Vitali, + 2009). Thus, limitations of that algorithm are inevitably shared by any EARMARK representation. Yet, this output is sufficiently rich to allow us to extract the overlapping information we need. For instance, the insertion of a nonbreaking space or a carriage return generates rows according to specific rules that can be easily detected to capture the actual change by the author.

+

Figure + 5 shows the aforementioned example rebuilt in EARMARK. All versions are encoded in the same document by creating overlapping assertions over the docuverses. Metadata and RDF statements are layered on top of those assertions and create a rich knowledge‐base about the history of the documents and, in particular, about the history of each fragment.

+
+ + + + + + + + +

The wiki sample versions encoded in a single EARMARK document. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

Due to the complexity of the example, we labeled arrows with numbers that indicate the position of each range within each markup item. Consider, for instance, Version V4: It is composed of two DIV elements, the first one containing the concatenation of “Bob was farming carrots” + “,” + “tomatoes” + “and” + “green” + “beans” + “.” and the second one containing the string “They were all tasteful.”

+

Implementing a wiki content‐filtering mechanism on top of such a structure is rather simple. For instance, the removal of all the contributions of “Angelo Di Iorio” that leaves untouched all the content written (previously and subsequently) by “Silvio Peroni” and “Fabio Vitali” can be performed straightforwardly. Three steps are enough to apply such an intermediate content reversion: + + +

the identification of the fragments written by “Angelo Di Iorio,” which is a straightforward SPARQL query on the embedded statements;

+ + +

the creation of a new version where references to those fragments are removed and references to fragments no longer in the document are correctly fixed;

+
+ +

the translation of that document into an actual MediaWiki page through the serialization process described in Peroni and Vitali ( + 2009).

+
+ +

+

Of course, an automatic process may generate ambiguities or even errors in the resulting content (i.e., some parts may become dangling, wrong, or unclear after removing text fragments elsewhere); grammar discrepancies also might be generated by the same approach. Linguistic and semantic problems, however, become a problem only once the technical issues of managing independent, yet successive, edits are solved. What is important is that all the information about overlaps and dependencies among fragments is available in EARMARK and can easily be searched, filtered, and manipulated. Besides, foreseeing a manual intervention for checking and polishing automatically filtered content is perfectly in line with the wiki philosophy, so that the wiki community itself can wisely use the reversion tools to revise the content and adjust any intervening minor nuisances or imperfections. Such checks would still be far simpler and faster than would the manual process of partially reverting versions as we have today.

+
+
+
+ Generating EARMARK From Existing Documents: The ROCCO Approach +

Since we do not expect documents to be natively written in EARMARK or manually created by users, we need a way to extract EARMARK data structures from existing XML‐based resources, which is trivial when the XML is simple and clearly hierarchical and slightly more complex when the XML contains workarounds to force an intrinsically overlapping situation into a single hierarchy.

+

We designed a reliable process to transform XML files into EARMARK documents that fully captures overlapping structures even when the overlaps are hidden in one of the many well‐known workarounds. This approach takes as input an XML file and produces the corresponding EARMARK document in five steps: Read, Overhaul, Convert, Classify, and Organize (hence, the name ROCCO).

+

Since ROCCO is not the main topic of this article, we very briefly discuss the issue of converting XML into EARMARK, explaining how each step works. The ROCCO algorithm performs five steps, described next.

+
+ Read and Overhaul +

The first two steps consist of loading the XML source file and, if needed, adding information useful for further processing. In EARMARK, there is a clear distinction between the textual content of a document and the structures built on top of it: The content is stored as plain text—within docuverses—and all structures are externalized and expressed through OWL and RDF assertions.

+

While OpenOffice stores all overlapping structures in the main document file, some other editors (e.g., MS Word) store overlaps in many different ways, even in a separate file. The overhaul step extracts such data and adds them to the main content document by exploiting format‐specific procedures, implemented via XSLT in most cases.

+
+
+ Convert +

The subsequent step consists of converting the XML source file into an early EARMARK document that expresses exactly the same information and hierarchies. No interpretation or disentanglement of workarounds is performed at this step.

+

Since the input is XML, this translation can be performed directly via a generic XSLT stylesheet. It basically consists of a recursive algorithm that parses the source file and generates the corresponding instances in the EARMARK ontology. Such a translation is straightforward and not difficult.

+
+
+ Classify +

The “Classify” step extends the EARMARK document built so far with information about the workarounds used to encode overlaps. That information will be exploited in the subsequent steps to make those overlaps explicit.

+

The basic idea is to exploit OWL reasoners to detect workarounds in an early EARMARK document D by: + + +

defining an ontology O that models all the workarounds used by applications, such as milestones, stand‐off markup, etc.; these workarounds are specific to the data format used in the source document;

+ + +

specifying the EARMARK document D as an ABox for the ontology O;

+
+ +

defining SWRL rules that capture the role of each element in D and check relationships between elements;

+
+ +

running an OWL reasoner, such as Pellet, on D+O to create new OWL instances and properties that identify which workarounds are present.

+
+ +

+

The actual detection of workarounds is delegated to an external reasoner. Refining detection strategies and even adding new strategies for new formats all can be done via OWL and SPARQL. Indeed, tricky issues need to be addressed—mostly depending on the idiosyncrasies of the original formats—but no procedural code is required.

+
+
+ Organize +

The final step consists of building yet another EARMARK document that expresses the overlaps and metadata in an explicit way, based on the information collected by the previous steps. This phase consists of mapping operations from the native format into the EARMARK structure. Such conversion relies on the identification of metadata to classify the operations and to externalize relevant metadata in separate RDF statements.

+
+
+
+ Evaluating EARMARK +

One of the most frequent criticisms when proposing a different approach to solving a well‐known problem in information and communication technology is that the new solution may simplify the difficulties of the specific problem, but brings with it hidden costs in terms of size of the data structure, computation efforts, or conversion restrictions that compensate the advantages. In our case, one of the anonymous reviewers of our article (Di Iorio, Peroni, & Vitali, + 2009) wondered whether a difference in file size could weigh in on the convenience of adopting EARMARK as opposed to working with the original files.

+

As such, a discussion of cost functions of EARMARK versus other formats is in order. Yet, a systematic discussion of the relative costs (e.g., in byte size) of some original XML‐based data structures versus their EARMARK equivalent is an open‐ended undertaking that heavily depends on the original XML data structure and the specific features present in the document, and is badly defined anyway: While XML is a linearization format immediately expressible in actual bytes, OWL (or more precisely, RDF, the language in which OWL ontologies are expressed) is an abstract structure that allows a large number of linearization formats (including XML itself) with corresponding huge differences in the final byte counts.

+

For these reasons, to provide at least an initial test of meaningful concepts, we selected two XML‐based data formats (OOXML and ODT) and, specifically, a set of documents where overlapping tricks were present (i.e., where change‐tracking was active). To bypass the size discussion, we decided not to test byte lengths (which are not meaningful and easily skewed, e.g., by reducing the string length of the element names or of the class names) but the number of nodes for XML documents and of triples for OWL documents. This comparison again is not particularly appropriate (Triples are naturally numerous in OWL ontologies, and it is customary to deal with hundreds of thousands and even millions of assertions in Semantic Web applications.) but closer to meaningfulness than is the mere byte count.

+

Our comparison was carried out on a small set of documents in ODT and OOXML that included change‐tracking information. As discussed in the previous sections, change‐tracking facilities generate rather complex overlaps even for basic operations on small text fragments, which in turn are expressed as a potentially huge number of standoffs and milestone markup within the XML hierarchy. The same documents were individually converted into EARMARK. We then charted how simple edits under change‐tracking affect the number of nodes in XML formats and of statements in OWL files. + 20 + +

The full details about each version and each format also are available at + http://www.essepuntato.it/2011/jasist/discussion +

+ +

+

We created seven different versions, named after the “Seven Dwarfs” for recognizability, by applying very common edits (e.g., the insertion of a few words, the deletion of some sentences, the split of a paragraph, etc.) on a small document, creating multiple overlaps. Figure + 6 shows the results of our comparison.

+
+ + + + + + + + +

A graph summarizing the results of the first experiment. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The overall trend is interesting and comforting: While in simple documents with no overlap the node count of XML is lower than is the assertion count of EARMARK triples, the presence of overlaps makes EARMARK and XML formats comparable. The growth of EARMARK statements is in fact very close to the growth of XML nodes when the number of overlaps increases. EARMARK is even more efficient than is XML for more complex documents.

+

The measurement for each format was made by counting only those nodes and statements instrumental in encoding content and (overlapping) structures: We did not take into account the presentational information for ODT and OOXML (each file, for instance, includes a very long list of style definitions that are not relevant for the purposes of our analysis), namespace declarations (OOXML files, for instance, list all relevant namespaces for the Office toolkit), or ignorable white spaces (which are only added to indent content and improve readability).

+

Interestingly, EARMARK and ODT show a very similar increase in size while OOXML is much more verbose and grows faster. The content of the first version, for instance, is encoded using four nodes in ODT, 13 statements in EARMARK, and 54 nodes in OOXML; the last one contains 241 ODT nodes, 233 EARMARK statements, and 452 OOXML nodes. To return to our original inquiry, it is clear that the weight of EARMARK documents is very good compared to the other ones.

+

Also note the regularity in the growth of EARMARK statements. Regardless of the actual modifications applied to the document, in fact, EARMARK adds about 40 statements for each edit. Both OOXML and ODT, on the contrary, show a more irregular “pace.” The reason for this is that EARMARK externalizes all assertions, so that all modifications (either to leaf nodes or to intermediate nodes in the original XML) are “flattened” onto the docuverses and do not depend on the complexity of the structure within which the edit took place.

+

Figure + 7 shows the results of a similar comparison on a different set of documents and edits. We collected seven versions named after the days of the week and created by seven different authors when editing a very simple document. The overall trend does not change, and shows that EARMARK and ODT again have a comparable behavior, far better than that of OOXML.

+
+ + + + + + + + +

A graph summarizing the results of the second experiment. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

In conclusion, although preliminary, this study shows clear trends of a very conservative behavior of EARMARK with respect to document size.

+
+
+ Conclusions +

Overlaps, far from being an obscure requirement for sophisticated functionalities of arcane markup languages, are a very frequent undertaking even in major data formats and in rather frequent situations. Yet, since the XML language does not allow them, consciously or not, designers of data formats have adopted a huge and entangled array of tricks, special cases, and workarounds that, although solving the actual problem of storing overlapping structures, open new and complicated ones when approaching even basic chores on documents containing them, such as queries.

+

The EARMARK approach drastically reduces the efforts needed to perform such chores on overlapping structures since it does not allow the corresponding multiple trees to actually entangle and complicate the job. EARMARK is radically different from both special markup metalanguages that allow overlaps and the introduction of workarounds within the traditional tree‐oriented XML language because it treats multiple trees over the same content as first‐class citizens of the language, yet uses well‐known and standard W3C technologies and languages to perform all tasks. EARMARK documents, at the end, are OWL ontologies. Thus, any Semantic Web technology (e.g., SPARQL) can be used straightforwardly to perform operations on their content.

+

Improving queries is not the only application of EARMARK. Validation is another interesting field that we are investigating. In fact, the same ontological framework can be used to prove properties concerning a document, such as validity against a schema, compliance to co‐constraint specifications, or adherence to structural patterns. Moreover, inspired by Marcoux and Rizkallah ( + 2009), in which they described an approach for defining natural‐language semantics for XML‐based languages, we also are developing an ontology‐based approach for encoding markup semantics—that is, the formal definition of meanings of markup elements, besides the syntactical structure of a markup document—within EARMARK documents.

+
+ + References + + + + Adida, + B. + , + + Birbeck, + M. + , + + McCarron, + S. + , & + + Pemberton, + S. + + ( + 2008). RDFa in XHTML: Syntax and processing. W3C Recommendation, October 14, 2008, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/rdfa‐syntax/ + + + + + + Allsopp, + J. + + ( + 2007). + Microformats: Empowering your markup for Web 2.0. + New York, NY: + Friends of ED Press. + + + + + + Bański, + P. + + ( + 2010). + Why TEI stand‐off annotation doesn't quite work: And why you might want to use it nevertheless. In + Proceedings of Balisage: The Markup Conference 2010. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://www.balisage.net/Proceedings/vol5/html/Banski01/BalisageVol5‐Banski01.html + + + + + + Beckett, + D. + + ( + 2004). RDF/XML syntax specification (Rev.). W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/2004/REC‐rdf‐syntax‐grammar‐20040210/ + + + + + + Berglund, + A. + , + + Boag, + S. + , + + Chamberlin, + D. + , + + Fernández, + M.F. + , + + Kay, + M. + , + + Robie, + J. + , & + + Siméon, + J. + + ( + 2007). XML Path Language (XPath) 2.0. W3C Recommendation, January 23, 2007, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/xpath20/ + + + + + + Brickley, + D. + , & + + Guha, + R.V. + + ( + 2004). RDF Vocabulary Description Language 1.0: RDF Schema. W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/rdf‐schema/ + + + + + + Ciccarese, + P. + , + + Wu, + E. + , + + Kinoshita, + J. + , + + Wong, + G. + , + + Ocana, + M. + , + + Ruttenberg, + A. + , & + + Clark, + T. + + ( + 2008). + The SWAN biomedical discourse ontology. + Journal of Biomedical Informatics, + 41( + 5), + 739– + 751. + + + + + + DeRose, + S. + ( + 2004). + Markup overlap: A review and a horse. In + Proceedings of the Extreme Markup Languages 2004. + Rockville, MD: + Mulberry Technologies. 
Retrieved from + http://conferences.idealliance.org/extreme/html/2004/DeRose01/EML2004DeRose01.html + + + + + + Di Iorio, + A. + , + + Marchetti, + C. + , + + Schirinzi, + M. + , & + + Vitali, + F. + + ( + 2009). + Natural and multi‐layered approach to detect changes in tree‐based textual documents. In + + J. + Cordeiro + & + + J. + Filipe + (Eds.), + Proceedings of the 11th International Conference on Enterprise Information Systems (ICEIS 2009) (pp. + 90– + 101). + Heidelberg, Germany: + Springer. + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + ( + 2009). + Towards markup support for full GODDAGs and beyond: The EARMARK approach. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Peroni01/BalisageVol3‐Peroni01.html + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + + ( + 2010). + Handling markup overlaps using OWL. In + + P. + Cimiano + & + + H. S. + Pinto + (Eds.), + Proceedings of the 17th International Conference on Knowledge Engineering and Knowledge Management (EKAW 2010) (pp. + 391– + 400). + Heidelberg, Germany: + Springer. + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + (in press). + Using Semantic Web technologies for analysis and validation of structural markup. + International Journal of Web Engineering and Technology. + + + + + + Drummond, + N. + , + + Rector, + A. + , + + Stevens, + R. + , + + Moulton, + G. + , + + Horridge, + M. + , + + Wang, + H.H. + , & + + Seidenberg, + J. + + ( + 2006). + Putting OWL in order: Patterns for sequences in OWL. In + + B. C. + Grau + , + + P. + Hitzler + , + + C. + Shankey + , & + + E. + Wallace + (Eds.), + Proceedings of the Workshop on OWL: Experiences and Directions (OWLED 2006), + Athens, GA. Retrieved from + http://sunsite.informatik.rwth‐aachen.de/Publications/CEUR‐WS/Vol‐216/submission_12.pdf + + + + + + Durand, + D.G. 
+ + ( + 1994, October). + Palimpsest, a data model for revision control. Paper presented at the Workshop on Collaborative Editing Systems at the Computer Supported Cooperative Work Conference (CSCW94), Chapel Hill, NC. + + + + + + Durand, + D.G. + ( + 2008). + Palimpsest: Change‐oriented concurrency control for the support of collaborative applications. + Charleston, SC: + CreateSpace. + + + + + + Garlik, + S.H. + , & + + Seaborne, + A. + + ( + 2010). SPARQL 1.1 Query Language. W3C Working Draft, October 14, 2010, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/sparql11‐query/ + + + + + + Georg, + R. + , + + Schonefeld, + O. + , + + Trippel, + T. + , & + + Witt, + A. + + ( + 2010). + Sustainability of linguistic resources revisited. In + Proceedings of the International Symposium on XML for the Long Haul: Issues in the Long‐Term Preservation of XML. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://www.balisage.net/Proceedings/vol6/html/Witt01/BalisageVol6‐Witt01.html + + + + + + Goldfarb, + C.F. + ( + 1990). + The SGML Handbook. + New York, NY: + Oxford University Press. + + + + + + Horridge, + M. + , & + + Patel‐Schneider, + P. + + ( + 2009). OWL 2 Web Ontology Language: Manchester Syntax. W3C Working Group Note October 27, 2009, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/owl2‐manchester‐syntax/ + + + + + + Horrocks, + I. + , + + Patel‐Schneider, + P.F. + , + + Boley, + H. + , + + Tabet, + S. + , + + Grosof, + B. + , & + + Dean, + M. + + ( + 2004). SWRL: A Semantic Web rule language combining OWL and RuleML. W3C Member Submission, May 21, 2004, World Wide Web Consortium. Retrieved + http://www.w3.org/Submission/SWRL/ + + + + + + Huitfeldt, + C. + , & + + Sperberg‐McQueen, + C.M. + + ( + 2003). TexMECS: An experimental markup meta‐language for complex documents. Retrieved from + http://decentius.aksis.uib.no/mlcd/2003/Papers/texmecs.html + + + + + JTC1/SC34 WG 4. ( + 2008). 
ISO/IEC 29500‐1:2008—Information technology—Document description and processing languages—Office Open XML File Formats: Part 1. + Fundamentals and markup language reference. + Geneva, Switzerland: + International Organization for Standardization. + + + + + JTC1/SC34 WG 6. ( + 2006). + ISO/IEC 26300:2006—Information technology—Open document format for office applications (OpenDocument), Version 1.0. + Geneva, Switzerland: + International Organization for Standardization. + + + + + + Marcoux, + Y. + , & + + Rizkallah, + E. + + ( + 2009). + Intertextual semantics: A semantics for information design. + Journal of the American Society for Information Science and Technology, + 60( + 9), + 1895– + 1906. + + + + + + Marinelli, + P. + , + + Vitali, + F. + , & + + Zacchiroli, + S. + + ( + 2008). + Towards the unification of formats for overlapping markup. + New Review of Hypermedia and Multimedia, + 14( + 1), + 57– + 94. + + + + + + Nelson, + T. + ( + 1980). + Literary machines: The report on, and of, Project Xanadu concerning word processing, electronic publishing, hypertext, thinkertoys, tomorrow's intellectual ⋖ including knowledge, education and freedom. + Sausalito, CA: + Mindful Press. + + + + + + Peroni, + S. + , & + + Vitali, + F. + + ( + 2009). + Annotations with EARMARK for arbitrary, overlapping and out‐of order markup. In + + U.M. + Borghoff + & + + B. + Chidlovskii + (Eds.), + Proceedings of the 2009 ACM Symposium on Document Engineering (DocEng 2009) (pp. + 171– + 180). + New York, NY: + ACM. + + + + + + Portier, + P. + , & + + Calabretto, + S. + + ( + 2009). + Methodology for the construction of multi‐structured documents. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Portier01/BalisageVol3‐Portier01.html + + + + + + Riggs, + K.R. + + ( + 2002). + XML and free text. 
+ Journal of the American Society for Information Science and Technology, + 53( + 6), + 526– + 528. + + + + + + Salembier, + P. + , & + + Benitez, + A.B. + + ( + 2007). + Structure description tools. + Journal of the American Society for Information Science and Technology, + 58( + 9), + 1329– + 1337. + + + + + + Schmidt, + D. + + ( + 2009). + Merging multi‐version texts: A generic solution to the overlap problem. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Schmidt01/BalisageVol3‐Schmidt01.html + + + + + + Schmidt, + D. + , & + + Colomb, + R. + + ( + 2009). + A data structure for representing multi‐version texts online. + Journal of Human–Computer Studies, + 67( + 6), + 497– + 514. + + + + + + Schonefeld, + O. + , & + + Witt, + A. + ( + 2006). + Towards validation of concurrent markup. In + Proceedings of the Extreme Markup Languages 2006. + Rockville, MD: + Mulberry Technologies. Retrieved from h + ttp://conferences.idealliance.org/extreme/html/2006/Schonefeld01/EML2006Schonefeld01.html + + + + + + Sperberg‐McQueen, + C.M. + + ( + 2006). + Rabbit/duck grammars: A validation method for overlapping structures. In + Proceedings of Extreme Markup Languages Conference 2006. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://conferences.idealliance.org/extreme/html/2006/SperbergMcQueen01/EML2006SperbergMcQueen01.html + + + + + + Sperberg‐McQueen, + C.M. + , & + + Huitfeldt, + C. + + ( + 2004). + GODDAG: A data structure for overlapping hierarchies. In + + P.R. + King + & + + E.V. + Munson + (Eds.), + Proceeding of the 5th International Workshop on the Principles of Digital Document Processing (PODDP 2000) (pp. + 139– + 160). + Heidelberg, Germany: + Springer. + + + + + TEI Consortium. ( + 2005). TEI P5: Guidelines for electronic text encoding and interchange. Retrieved from + http://www.tei‐c.org/Guidelines/P5 + + + + + + Tennison, + J. 
+ , & + + Piez, + W. + + ( + 2002, August). The Layered Markup and Annotation Language (LMNL). Paper presented at the Extreme Markup Languages Conference 2002, Montreal, Canada. + + + + + + Tummarello, + G. + , + + Morbidoni, + C. + , & + + Pierazzo, + E. + + ( + 2005). + Toward textual encoding based on RDF. In + + M. + Dobreva + & + + J. + Engelen + (Eds.), + Proceedings of the Ninth ICCC International Conference on Electronic Publishing (ELPUB2005). + Leuven, Belgium: + Peeters. + + + + + W3C OWL Working Group. ( + 2009). OWL 2 web ontology language document overview. W3C Recommendation, October 27, 2009, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/owl2‐overview/ + + + + +
diff --git a/xml/wiley-sample.xml b/xml/wiley-sample.xml new file mode 100644 index 0000000..088e568 --- /dev/null +++ b/xml/wiley-sample.xml @@ -0,0 +1,1776 @@ + + +
+ + + Wiley Subscription Services, Inc., A Wiley Company + Hoboken + + 10.1002/(ISSN)1532-2890 + 1532-2882 + 1532-2890 + + + + + Journal of the American Society for Information Science and Technology + J. Am. Soc. Inf. Sci. + + + + Journal of the American Society for Information Science + 0002-8231 + 1097-4571 + 2000 + 51 + 14 + + + + + 10.1002/asi.v62.9 + + 62 + 9 + + September 2011 + + + 10.1002/asi.21591 + + + + + + + + Research Article + Research Articles + + © 2011 ASIS&T + + + + + + + + + + + + + 1696 + 1716 + + + data formats + overlap + markup languages + semantic web + data conversion + + + + + + + + + + + + + A Semantic Web approach to everyday overlapping markup + + + + + Angelo + Di Iorio + + + diiorio@cs.unibo.it + + + + + Silvio + Peroni + + + speroni@cs.unibo.it + + + + + Fabio + Vitali + + + fabio@cs.unibo.it + + + + + + Department of Computer Science, University of Bologna, Bologna, Italy + + + + + Abstract +

Overlapping structures in XML are not symptoms of a misunderstanding of the intrinsic characteristics of a text document nor evidence of extreme scholarly requirements far beyond those needed by the most common XML‐based applications. On the contrary, overlaps have started to appear in a large number of incredibly popular applications hidden under the guise of syntactical tricks to the basic hierarchy of the XML data format. Unfortunately, syntactical tricks have the drawback that the affected structures require complicated workarounds to support even the simplest query or usage. In this article, we present Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), an approach to overlapping markup that simplifies and streamlines the management of multiple hierarchies on the same content, and provides an approach to sophisticated queries and usages over such structures without the need of ad‐hoc applications, simply by using Semantic Web tools and languages. We compare how relevant tasks (e.g., the identification of the contribution of an author in a word processor document) are of some substantial complexity when using the original data format and become more or less trivial when using EARMARK. We finally evaluate positively the memory and disk requirements of EARMARK documents in comparison to Open Office and Microsoft Word XML‐based formats.

+
+
+
+
+ + +
+ Introduction +

The overwhelming consensus among XML practitioners is that documents are trees, the hierarchy is the fundamental data structure, and violations of the hierarchy are errors or unnecessary complications. Therefore, overlapping markup has received ambivalent, almost schizoid considerations in the field of markup languages. Traditionally, overlaps were the hallmarks of bad HTML coders and naïve HTML page editors, taking advantage of the unjustified benevolence in web browsers that would display basically any HTML regardless of proper nesting. At the same time, far from the awareness of the general public, overlaps have been a fringe, almost esoteric, discipline of scholars in the humanities, competently used for arcane specifications of linguistic annotations and literary analysis.

+

Although the first type of overlap was judged with scorn and the second with awe, they both fundamentally represent a situation that is more common than was thought, and the scholars were only more aware, and not more justified, about the need to represent overlaps.

+

Generally, overlap is needed when multiple independent items refer to the same segment, either when considering textual markup documents or multimedia structures (Salembier & Benitez, + 2007). Regarding documents with markup, we need overlap whenever multiple markup elements need to be applied over the same content, and these elements happen to be independent of each other. In some (rather frequent) situations, this independence means that the content referred to by some elements is partially, but not completely, the same as the content referred to by other elements.

+

This situation is more frequent than it may appear: Not only do bad HTML code and arcane linguistic annotations use overlap, but many more mainstream and mundane examples exist. For instance, change tracking in an office document is often at odds with the underlying structure of the text, microformats (Allsopp, + 2007) and Resource Description Framework–in–attributes (RDFa; Adida, Birbeck, McCarron, & Pemberton, + 2008) annotations may need to refer to concepts that span across multiple XML elements, complex data structures (e.g., biological data) force graphs into trees and hide multiple parentage as internal references, and so on.

+

Differently from SGML, which is able to handle some overlapping scenarios through the CONCUR notation (Goldfarb, + 1990), XML grammatically imposes and requires a strict hierarchy of containment generating a single mathematical tree of the document where no overlap is allowed. This requirement has been turned into an intrinsic characteristic of the documents XML was meant to represent rather than a syntactical and conceptual constraint into which these documents need to fit. Thus, whenever authors needed to cope with independent markup elements, they managed either by naïvely ignoring the hierarchical limitation (and therefore creating invalid documents) or by creating careful workarounds within the syntactical constraint, or even by inventing completely new markup languages that allow some types of overlap. But while new multihierarchical markup languages such as TexMecs (Huitfeldt & Sperberg‐McQueen, + 2003) and LMNL (Tennison & Piez, + 2002) have a small number of adepts and applications, and while bad HTML coders and bad HTML page editors are disappearing from the market, the careful workarounds within the XML syntax (TEI Consortium, + 2005), such as segmentation, milestones, or standoff markup, are to this day frequently used and ubiquitous.

+

All workarounds share the same approach of hiding structural information about a secondary hierarchy under the guise of something else: split individual elements, empty boundary elements, indirect references, and so on. The result is that the secondary structural information is hidden or its importance is lessened to not break or obfuscate the main hierarchy expressed in the visible XML structure. But this comes at a price: Structures specified through workarounds are more difficult to find, identify, and act upon than are the structures in the main XML hierarchy. Thus, trivial searches that should amount to a short XPath in a more direct situation end up being multiple‐lines long, pretty basic visualizations require incredibly complex XSLT stylesheets, specific choices of the main markup hierarchy actually prevent some features of the secondary markup to even exist, and so on. So, although workarounds exist and can be used, hierarchies expressed through them are “second‐class citizens” that cannot fully exploit the sophisticated tools that the XML language provides.

+

In this article, we show how Extremely Annotational Resource Description Framework (RDF) Markup (EARMARK), our proposal for managing overlapping markup, does not generate first‐ and second‐class hierarchies, and allows existing, sophisticated tools to be used on all markup—even in the presence of overlaps. Rather than creating a completely new language requiring completely new tools and competencies, EARMARK uses Semantic Web technologies and Semantic Web tools to obtain many of the results obtainable with usual XML tools.

+

EARMARK defines markup vocabularies by means of OWL ontologies (W3C OWL Working Group, + 2009). Since each individual markup item is an independent assertion over some content or some other assertions, overlaps of content are not a problem, nor are all the issues connected to physical embedding and containments, such as contiguity and document order. Furthermore, by using standard Semantic Web technologies, fairly sophisticated functionalities can be provided over EARMARK documents.

+

Through EARMARK, operations that were previously very hard or impossible exactly because of the interferences of the multiple hierarchies or of the workarounds they employed now become fundamentally trivial since no syntactical tricks are employed and the different hierarchies do not interfere with each other. Thus, for instance, identifying the individual contributions in a multi‐authored MS Word or Open Office document is quite hard on their original XML formats, and becomes trivial when the same documents are converted into EARMARK.

+

This article is an extended version of previous works on EARMARK (Di Iorio, Peroni, & Vitali, + 2010; Peroni & Vitali, + 2009). In those works, we focused on identifying workarounds for overlapping data existing in real XML documents and translating them into EARMARK assertions. We also sketched the EARMARK ontology and presented a simple implementation of EARMARK‐aware tools. This article follows and extends them and also provides some novel contributions: + + +

The systematic analysis of the EARMARK model, with particular attention to data typing and overlapping structures

+ + +

The discussion of further applications for the ontological EARMARK approach. In particular, we show how EARMARK can be used to improve the content filtering and reversion mechanisms of wikis.

+
+ +

The brief description of a process, called ROCCO, for generating EARMARK documents from existing XML documents (even ones that use workarounds for overlapping structures)

+
+ +

An evaluation of EARMARK efficiency when dealing with multiple hierarchies in comparison with the XML structures used by popular XML‐based formats such as Open Office and MS Word.

+
+ +

+

The article is structured as follows: First, we provide a brief overview of existing approaches to overlap using workarounds in XML or ad hoc markup metalanguages, and then give a few examples of situations where overlaps are used today and sometimes in rather mainstream situations. Next, we present the EARMARK model and its rules. Then, we provide some use cases that are meant to demonstrate the superiority of the EARMARK approach to a traditional XML format, especially when overlaps come into question, and show the generation of EARMARK documents, converting legacy documents. An initial evaluation of the efficiency of EARMARK compared to popular XML‐based data formats such as Open Document (ODT) and Office Open XML (OOXML) is presented, followed by our conclusions.

+
+
+ Existing Approaches to Overlapping +

The need for multiple overlapping structures over documents using markup syntaxes such as XML and SGML is an age‐old issue, and a large amount of literature exists on the techniques, languages, and tools that allow users to create multiple entangled hierarchies over the same content. A good review can be found in DeRose ( + 2004).

+

Some research has proposed using plain hierarchical markup (i.e., XML) and employing specially tailored elements or attributes to express the semantics of overlapping in an implicit way. The TEI Guidelines (TEI Consortium, + 2005) presented a number of different techniques that use SGML/XML constructs to force multiple hierarchies into a single one, including: + + +

milestones (i.e., overlapping structures are expressed through empty elements to mark the boundaries of the “content”),

+ + +

fragmentation (i.e., overlapping structures are split into individual, nonoverlapping elements that may even be linked through id–idref pairs), and

+
+ +

standoff markup (i.e., overlapping structures are placed elsewhere and indirectly refer to their would‐be locations through pointers, locators, and/or id–idref pairs).

+
+ +

+

Given the large number of techniques to deal with overlapping structures in XML, in Marinelli, Vitali, and Zacchiroli ( + 2008), we presented a number of algorithms to convert XML documents with overlapping structures from and to the most common approaches, as well as a prototype implementation.

+

Riggs ( + 2002) introduced a slightly different technique for fragmentation within XML structures. In this proposal, floating elements (i.e., those elements that do not fall in a proper or meaningful hierarchical order) are created using the name of the element followed by an index referring to its semantically related parent element. For example, the floating element <name.person[2]>John</name.person[2]> means that <name>John</name> is semantically a child of the second occurrence of the element person, even though the floating element is not structurally contained by its logical parent.

+

Other research even has proposed to get rid of the theory of trees at the base of XML/SGML altogether and use different underlying models and newly invented XML‐like languages that allow the expression of overlaps through some kind of syntactical flourishing. For instance, a general ordered‐descendant directed acyclic graph (GODDAG; Sperberg‐McQueen & Huitfeldt, + 2004) is a family of graph‐theoretical data structures to handle overlapping markup. A GODDAG's nodes represent markup elements and text. Arcs are used to explicitly represent containment and father–child relations. Since multiple arcs can be directed to the same node, overlapping structures can be straightforwardly represented in GODDAG. Full GODDAGs cannot be linearized in any form using embedded markup, but restricted GODDAGs, a subset thereof, can be and have been linearized into TexMecs (Huitfeldt & Sperberg‐McQueen, + 2003), a multihierarchical markup language that also allows full GODDAGs through appropriate nonembedding workarounds such as standoff markup.

+

LMNL (Tennison & Piez, + 2002) is a general data model based on the idea of layered text fragments and ranges, where multiple types of overlap can be modeled using concepts drawn from the mathematical theory of intervals. Multiple serializations of LMNL exist, such as CLIX and LMNL‐syntax.

+

XConcur (Schonefeld & Witt, + 2006) is a similar solution based on the representation of multiple hierarchies within the same document through layers. Strictly related to its predecessor CONCUR as it was included in the SGML, XConcur was developed in conjunction with the validation language XConcur‐CL to handle relationships and constraints between multiple hierarchies.

+

The variant graph approach (Schmidt & Colomb, + 2009) also is based on graph theory. Developed to deal with textual variations that generate multiple versions of the same document with multiple overlapping hierarchies, this theory proposes a new data model to represent literary documents and a graph linearization (based on lists) that scales well even with a large number of versions. The same authors recently presented an extension of their theory that also allows users to merge multiple variants into one document (Schmidt, + 2009). Portier and Calabretto ( + 2009) presented a detailed survey of overlapping approaches and also discussed the MultiX2 data model, which uses W3C standard languages such as XInclude to link and fetch text fragments within overlapping structures, and a prototype editor for the creation of multistructured documents.

+

Tummarello, Morbidoni, and Pierazzo ( + 2005) proposed using RDF as a standoff notation for overlapping structures of XML documents. Since this proposal has many affinities with the one we are presenting in this article, we later discuss its characteristics and compare it with ours.

+
+
+ More Frequent Than One May Think: Overlapping in the Wild +

Overlapping structures have been considered often as appropriate only in highly specific contexts and basically for scholars: The solutions that have been proposed in the literature were complex since they were considered grounded in the intrinsic complexity of the topics themselves. Yet, overlapping structures can be found in many more fields than these, and even mainstream applications generate and use markup with overlapping structures. While the complexity of overlapping is hidden to the final user, applications that consume such data may very well find it rather difficult to handle such information. We next discuss three very different contexts where overlapping already exists and fairly relevant information is encoded in multiple independent structures, leaving to special code the task of managing the complexity.

+
+ Change Tracking in Office Document Formats +

Word processors such as Microsoft Word and Open Office provide users with powerful tools for tracking changes, allowing each individual modification by individual authors to be identified, highlighted, and acted upon (e.g., by accepting or discarding them). The intuitiveness of the relevant interfaces actually hides the complexity of the data format and of the algorithms necessary to handle such information.

+

For instance, the standard ODT format (JTC1/SC34 WG 6, + 2006) used by Open Office, when saving change‐tracking information, relies on two specific constructs for insertions and deletions that may overlap with the structural markup. Adding a few words within a paragraph is not in itself complex, as it does not require the breaking of the fundamental structural hierarchy; conversely, changes that affect the structure itself (e.g., the split of one paragraph in two by the insertion of a return character, or the joining of two paragraphs by the elimination of the intermediate return character) require that annotations are associated to the end of a paragraph and the beginning of the next, in an unavoidably overlapping pattern. ODT uses milestones and standoff markup for insertions and deletions, respectively, and also relies on standoff markup for annotations about the authorship and date of the change.

+

For instance, the insertion of a return character and a few characters in a paragraph creates a structure as follows:

+

+ + + + + + +

+

The empty elements <text:change‐start/> and <text:change‐end/> are milestones marking the beginning and the end, respectively, of the range that constituted the insertion while the element <text:insertion>, before the beginning of the document content, is standoff markup for the metadata about the change (author and date information).

+

Similarly, a deletion creates a structure as follows:

+

+ + + + + + +

+

The element <text:change/> represents a milestone of the location where the deletion took place in the content, and the corresponding standoff markup annotation <text:deletion> contains not only the metadata about the change but also the text that was deleted.

+

The OOXML format (JTC1/SC34 WG 4, + 2008) (the XML‐based format used by Microsoft Office 2007 and standardized by ISO in 2008), on the other hand, uses a form of segmentation to store change‐tracking information across all previous elements involved.

+

+ + + + + + +

+

This heavily simplified version of an OOXML document shows two separate changes: (a) the insertion of a return character and (b) the insertion of a word. These modifications are not considered as a single change; therefore, the segments are not connected to each other but simply created as needed to fit the underlying structure.

+

In fact, change tracking in OOXML is a fairly complex proposition. Although providing more complete coverage of special cases and situations than does ODT, dealing with its intricacies is not for the casual programmer. Even a simple XSLT stylesheet to show inserted text in a different color and hide deleted text may run several hundred lines of code. + 1 + +

+ http://OOXMLdeveloper.org/archive/2006/09/07/625.aspx +

+ +

+
+
+ Overlapping With Microformats +

Microformats (Allsopp, + 2007) add semantic markup to web documents by using common structures of the HTML language itself—in particular, the class attribute.

+

The HTML code is annotated using microformats to provide new semantic, machine‐processable assertions. In the following example, a plain HTML table is enriched with metadata about events + 2 + +

HCalendar, + http://microformats.org/wiki/hcalendar +

+ + and people: + 3 + +

HCard, + http://microformats.org/wiki/hcard +

+
+

+

+ + + + + + +

+

The table was enriched by additional data declaring it to be an event (a conference), and data about the event itself (URL, summary, location) and about four relevant individuals (with their names and roles within the conference) were associated where necessary to the actual content of the table.

+

So far, so good, and no overlap to speak about. Things change dramatically, though, when the overall structure of the main hierarchy (the HTML table) is at odds with the intrinsic hierarchy of the microformat data, such as if the people are organized in columns rather than rows. For instance:

+

+ + + + + + +

+

Unfortunately, vcards are a hierarchy themselves, and if the hierarchy of vcards is organized differently from the hierarchy of the HTML table, as in the latter case, it is just impossible to define the four vcards for the four people organizing the conference. Thus, in plain HTML, the choice of one of two possible presentation models for the main hierarchy of content makes trivial or completely impossible the existence of the second hierarchy.

+

A possible and partial solution to express vcard hierarchies in the latter example is RDFa (Adida et al., + 2008), a W3C recommendation. It describes a mechanism to embed RDF statements into HTML documents by using some HTML attributes (href, rel, rev, content) in combination with other ad hoc attributes (property, about, typeof) proposed in the recommendation itself.

+

+ + + + + + +

+

Since all attributes live in the context of elements, the price to pay is that to assert everything we want to assert, we often need to add some structurally unnecessary elements to the current markup hierarchy of a document, needed only to add the RDF statements (e.g., the span elements emphasized earlier). Even if that does not represent a significant problem for strict Semantic Web theorists, document architects and markup experts see this as a kludge and an inelegant compromise.

+
+
+
+ Wikis: No Overlapping Where Some Should Be +

The strength of wikis lies in their allowing users to modify content at any time. The mechanisms of change‐tracking and rollback that are characteristics of all wikis, in fact, promote users' contributions and make “malicious attacks” pointless in the long run since previous versions can be easily restored.

+

A number of tools exist that automatically discover “wiki vandalisms” and provide users with powerful interfaces to surf changes, identify differences between subsequent versions, and revert content. For instance, Huggle + 4 + +

+ http://en.wikipedia.org/wiki/Wikipedia:Huggle +

+ + is an application dealing with vandalism in Wikipedia, based on a proxy architecture and .NET technologies. A straightforward interface allows users to access any version of a page, highlights contributions of a specific user, and reverts the content to old versions.

+

Even client‐side tools—meant to be installed as browser extensions or bookmarklets—exist to extend the rollback mechanisms of Wikipedia, giving users more flexibility and control over (vandalistic) changes. For instance, Lupin + 5 + +

+ http://en.wikipedia.org/wiki/User:Lupin/Anti‐vandal_tool +

+ + is a set of JavaScript scripts that check a wiki page against a list of forbidden terms so that authors can identify undesirable modifications and restore previous (i.e., good) versions without a continuous control over the full content of the page; yet again, Twinkle + 6 + +

+ http://en.wikipedia.org/wiki/Wikipedia:Twinkle +

+
+ provides users powerful rollback functions and includes a full library of batch deletion functions, automatic reporting of vandals, and user notification functions.

+

These tools are successful in highlighting vandalism and in identifying versions created by malicious users. However, although it is possible to revert the page to any previous version, all changes (even acceptable ones) that were subsequent to the malicious version cannot be automatically inherited by the restored page.

+

For instance, consider Versions V1, V2, and V3 of a wiki page, where Version V1 contains a baseline (i.e., acceptable) content, and Version V2 is identified as a partial vandalism and is agreed to be removed, but Version V3 contains (possibly, in a completely different section than the target of the malicious attack) relevant and useful content that was added before the vandalistic Version V2 was declared as such. The task of removing the modifications of Version V2 while maintaining (whatever is possible of) Version V3 is a difficult, error‐prone, and time‐consuming task if done manually, yet there is no tool we are aware of that automatically filters contributions from multiple versions and merges them into a new one (or, equivalently, removes only selected intermediate versions).

+

However, it is possible to theoretically characterize the interdependencies between subsequent changes to a document. In fact, literature has existed for a long time on exactly these themes (e.g., Durand, + 1994, + 2008). Although a detailed discussion of abstract models of interconnected changes is out of scope for this article (details and authoritative references can be found in the aforementioned works), what is relevant in this discussion is that they happen to assume a hierarchical form that is frequently at odds with the hierarchical structure of the content of the document, and as such, most issues derive from the data structures in which content is stored and from the model for manipulating these structures. For instance, the fact that in the wiki perspective each version is an independent unit that shares no content (even unchanged content) with the other versions prevents considering multiple versions as overlapping structures coexisting on the same document. If we were able to make these hierarchies explicit, we would be able to create models and tools to manipulate these documents in a more powerful way and to exploit the existing interconnections between the overlapping hierarchies.

+
+
+ Introduction to EARMARK and Its Support for Overlapping Features +

The presence of hidden overlapping structures—transparent to users, but very difficult to handle by applications—is the common denominator for the scenarios described in the previous section. More than the overlap itself, which cannot be ignored because it does exist and carries important meanings, the problem we face lies in the way applications store such overlapping structures. In the XML world, in fact, the only way to do so is through the use of (complex) workarounds that force the multiple hierarchies into one hierarchy of an XML document. That makes it very tricky to perform sophisticated analysis and searches.

+

This section discusses a different approach to metamarkup, EARMARK (Di Iorio, Peroni, & Vitali, + 2009; Di Iorio et al., + 2010; Peroni & Vitali, + 2009) based on ontologies and Semantic Web technologies. The basic idea is to model EARMARK documents as collections of addressable text fragments, and to associate such text content with OWL assertions that describe structural features as well as semantic properties of (parts of) that content. As a result, EARMARK allows not only documents with single hierarchies (as with XML) but also multiple overlapping hierarchies where the textual content within the markup items belongs to some hierarchies, but not to others. Moreover, EARMARK makes it possible to add semantic annotations to the content through assertions that may overlap with existing ones.

+

One of the advantages of using EARMARK is the capability to access and query documents by using well‐known and widely supported tools for the Semantic Web. In fact, EARMARK assertions are simply RDF assertions while EARMARK documents are modeled through OWL ontologies. The consequence is that query languages (e.g., SPARQL; Garlik & Seaborne, + 2010) and actual existing tools such as Jena + 7 + +

+ http://jena.sourceforge.net +

+ + and Pellet + 8 + +

+ http://pellet.owldl.com +

+
+ can be directly used to deal with even incredibly complicated overlapping structures. What is very difficult (or impossible) to do with traditional XML technologies becomes much easier with these technologies under the EARMARK approach.

+

In the rest of this section, we give a brief overview of the EARMARK model and then describe how EARMARK can be used to deal with the issues presented earlier. The model itself is defined through an OWL document, + 9 + +

+ http://www.essepuntato.it/2008/12/earmark +

+ + summarized in Figure + 1, specifying classes and relationships. We distinguish between ghost classes, which define the general model, and shell classes, which are actually used to create EARMARK instances.

+
+ + + + + + + + +

A UML‐like representation of the EARMARK ontology. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+
+ Ghost Classes +

The ghost classes describe three disjoint base concepts—docuverses, ranges, and markup items—through three different and disjoint OWL classes. + 10 + +

All our OWL samples are presented using the Manchester Syntax (Horridge & Patel‐Schneider, + 2009), which is one of the standard linearization syntaxes of OWL. The prefixes rdfs and xsd refer to RDF Schema and XML Schema namespaces, respectively, while the empty prefix refers to the EARMARK ontology URI plus “#.” Moreover, we use the prefix c to indicate entities taken from an imported ontology made for the SWAN project (Ciccarese et al., + 2008); available at + http://swan.mindinformatics.org/spec/1.2/collections.html +

+ +

+

The textual content of an EARMARK document is conceptually separated from its annotations, and is referred to through the Docuverse class. + 11 + +

This class (and its name) is based on the concept introduced by Ted Nelson ( + 1980) in his Xanadu Project to refer to the collection of text fragments that can be interconnected to each other and transcluded into new documents.

+ + The individuals of this class represent the object of discourse (i.e., all the containers of text of an EARMARK document).

+

+ + + + + + +

+

Any individual of the Docuverse class—commonly called a docuverse (lowercase to distinguish it from the class)—specifies its actual content with the property hasContent.

+

We then define the class Range for any text lying between two locations of a docuverse. A range (i.e., an individual of the class Range) is defined by a starting and an ending location (any literal) of a specific docuverse through the properties begins, ends, and refersTo, respectively.

+

+ + + + + + +

+

There is no restriction on locations used for the begins and ends properties. That is very useful because it allows us to define ranges that follow or reverse the text order of the docuverse to which they refer. For instance, the string “desserts” can be considered both in document order, with the begins location lower than the ends location, or in the opposite order, forming “stressed.” + 12 + +

+ http://en.wikipedia.org/wiki/Palindrome#Semordnilaps +

+ + Thus, the values of the properties begins and ends define the way a range must be read.

+

The class MarkupItem is the superclass defining artifacts to be interpreted as markup (e.g., elements and attributes).

+

+ + + + + + +

+

A markupitem individual is a collection (c:Set, c:Bag, or c:List, where the latter is a subclass of the second one, and all of them are subclasses of c:Collection) of individuals belonging to the classes MarkupItem and Range. Through these collections, it is possible to define a markup item as a set, a bag, or a list of other markup items, using the properties element (for sets) and item and itemContent (for bags and lists). Thus, it becomes possible to define elements containing nested elements or text, or attributes containing values, as well as overlapping and complex structures. Note also that handling collections directly in OWL allows us to reason about content models for markup items, which would not be possible if we had used the corresponding constructs in RDF. + 13 + +

+ http://hcklab.blogspot.com/2008/12/moving‐towards‐swan‐collections.html +

+ +

+

A markupitem also might have a name, specified in the functional property hasGeneralIdentifier (recalling the SGML term to refer to the name of elements; Goldfarb, + 1990), and a namespace, specified using the functional property hasNamespace. Note that we can have anonymous markup items—as is possible in LMNL (Tennison & Piez, + 2002) and in GODDAG (Sperberg‐McQueen & Huitfeldt, + 2004)—by simply asserting that the item belongs to the class of all those markupitems that do not have a general identifier (i.e., hasGeneralIdentifier exactly 0).

+
+
+ Shell Classes +

The ghost classes discussed so far give us an abstract picture of the EARMARK framework. We need to specialize our model, defining a concrete description of our classes. These new shell subclasses apply specific restrictions to the ghost classes.

+

First, the class Docuverse is restricted to be either a StringDocuverse (i.e., the content is specified by a string) or a URIDocuverse (i.e., the actual content is located at the URI specified).

+

+ + + + + + +

+

Depending on particular scenarios or on the kind of docuverse we are dealing with (plain‐text, XML, LaTeX, a picture, etc.), we need to use different kinds of ranges. Therefore, the class Range has three different subclasses: + + +

PointerRange defines a range by counting characters. In that case, the value of the properties begins and ends must be a nonnegative integer that identifies unambiguous positions in the character stream, remembering that the value 0 refers to the location immediately before the first character, the value 1 refers to the location after the first character and before the second one, and so on. By using the hasKey OWL property, we also assert that two pointer ranges having equal docuverse and begin and end locations are the same range.

+ + +

XPathRange defines a range considering the whole docuverse or its particular context specifiable through an XPath expression (Berglund et al., + 2007) as value of the property hasXPathContext. Note that by using these ranges, we implicitly admit that the docuverse they refer to must be an XML structure. Moreover, the properties begins and ends have to be applied on the string value obtained by juxtaposing all the text nodes identified by the XPath. By using the hasKey OWL property, we also assert that two xpath ranges having equal docuverse, XPath context, and begin and end locations are the same range.

+
+ +

XPathPointerRange is an XPathRange in which the value of the properties begins and ends must be a nonnegative integer that identifies unambiguous positions in the character stream as described for the class PointerRange.

+
+ +

+

+ + + + + + +

+

MarkupItem is specialized in three disjoint subclasses—Element, Attribute, and Comment—that allow a more precise characterization of markup items.

+

+ + + + + + +

+
+
+ Range and Markup Item Overlap +

The presence of overlap in EARMARK is worth discussing in more detail. Different types of overlap exist, according to the subset of items involved, and different strategies are needed to detect them. In particular, there is a clear distinction between overlapping ranges and overlapping markup items.

+

By definition, overlapping ranges are two ranges that refer to the same docuverse, so that at least one of the locations of the first range is contained in the interval described by the locations of the second range (excluding its terminal points). Totally overlapping ranges have the locations of the first range completely contained in the interval of the second range, or vice versa, while partially overlapping ranges have either exactly one location inside the interval and the other outside or identical terminal points in reversed roles.

+

Thus, if we consider the following excerpt:

+

+ + + + + + +

+

we can infer, through a reasoner such as Pellet, that these two ranges overlap by using the following rules:

+

+ + + + + + +

+

where P is one of: + + +

lessThan(b1,e1) ∧ greaterThan(b2,b1) ∧ lessThan(b2,e1)

+ + +

lessThan(b1,e1) ∧ greaterThan(e2,b1) ∧ lessThan(e2,e1)

+
+ +

lessThan(e1,b1) ∧ greaterThan(b2,e1) ∧ lessThan(b2,b1)

+
+ +

lessThan(e1,b1) ∧ greaterThan(e2,e1) ∧ lessThan(e2,b1).

+
+ +

+

The case of overlapping markup items is slightly more complicated. We define that two markup items A and B overlap when at least one of the following sentences holds: + + +

[Overlap by range]: A contains a range that overlaps with another range contained by B.

+ + +

[Overlap by content hierarchy]: A and B contain at least a range in common.

+
+ +

[Overlap by markup hierarchy]: A and B contain at least a markup item in common.

+
+ +

+

The three possible scenarios for such item overlap are summarized in Figure + 2. + 14 + +

The EARMARK documents describing these three overlapping scenarios and all the other ones presented in the following sections are available at + http://www.essepuntato.it/2011/jasist/examples +

+ +

+
+ + + + + + + + +

Three EARMARK examples of overlapping between elements p. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The EARMARK ontology, in fact, is completed by another ontology + 15 + +

+ http://www.essepuntato.it/2011/05/overlapping +

+ + that models all overlapping scenarios, either for ranges or markup items, and includes rules for automatically inferring overlaps through a reasoner.

+
+
+ EARMARK as a Standoff Notation +

If we ignore for a moment the semantic implications of using EARMARK and concentrate on its syntactical aspects only, it is easy to observe that EARMARK is nothing but yet another standoff notation, where the markup specifications point to, rather than contain, the relevant substructure and text fragments.

+

Standoff notations, also known in literature as out‐of‐line notations (TEI Consortium, + 2005), are hardly new, but never really caught on for a number of reasons, most having to do with their perceived fragility under the circumstances of desynchronized modification to the text. In Georg, Schonefeld, Trippel, and Witt ( + 2010) and Bański ( + 2010), we can find a pair of recent and substantially complete analyses of their merits and demerits. In particular, according to Georg, Schonefeld, Trippel, and Witt ( + 2010), “standoff annotation has … quite a few disadvantages: + + +

very difficult to read for humans

+ + +

The information, although included, is difficult to access using generic methods.

+
+ +

Limited software support as standard parsing or editing software cannot be employed.

+
+ +

Standard document grammars can be used only for the level which contains both markup and textual data.

+
+ +

New layers require a separate interpretation.

+
+ +

Layers, although separate, often depend on each other.” + 16 + +

To individually address the issues, we edited the original bullets into a numbered list.

+ +

+
+ +

+

And yet, although EARMARK is in practice a standoff notation, it provides a number of workarounds to most of the aforementioned issues.

+

First, since EARMARK is based on OWL and can be linearized in any of the large number of OWL linearization syntaxes, it follows that (a) readability, (b) access, and (c) software support for it are exactly those existing for well‐known, widespread, and important W3C standards such as RDF and OWL. Being able to employ common RDF and OWL tools such as Jena and SPARQL for EARMARK documents was in fact a major motivation for it.

+

Issue 4 should be examined beyond the mere validation against document grammars and toward a general evaluation of the level of compliance of the markup with some formally specified expectations. EARMARK documents, while being subject to no document grammar in the stricter XML sense, allow the specification of any number of constraints, expressed either directly in OWL or SWRL (Horrocks et al., + 2004), or even in SPARQL, that trigger or generate validity evaluations. In Di Iorio, Peroni, and Vitali (in press), we tried to show that a large number of requirements, from hierarchical well‐formedness in the XML sense, to validation requirements in terms of XML DTDs, to adherence to design patterns, can be expressed satisfactorily using these technologies.

+

Issue 5 regards the difficulty of standoff notations to provide interlayer analysis on XML structures: Separate interpretation of markup layers is easy, but identification and validation of overlapping situations are more complex: Standoff markup is mainly composed of pointers to content and does not have any direct way to determine overlap locations without some kind of pointer arithmetics to compute them. Validation of contexts allowing overlaps as describable using rabbit/duck grammars (Sperberg‐McQueen, + 2006) also is not trivial. In this regard, EARMARK provides yet again a solution that does not require special tools: Although OWL does not allow direct pointer arithmetics, SWRL on the contrary does, as shown earlier where we described a batch of (SWRL‐implementable) rules that do, in fact, determine overlapping locations on EARMARK documents with good efficiency.

+

Finally, Issue 6 refers to the fact that evolution of separate markup annotation layers needs to synchronously take place, lest one of them becomes misaligned with the new state of the document. This is, in summary, the fragility of pointers, which can be considered the fundamental weakness of standoff, as well as of any notation that has markup separate from its content: If a modification occurs to the underlying (probably text‐based) source, all standoff pointers that could not be updated at the same time as the change become outdated and possibly wrong. All standoff notations fall prey to this weakness, and there is no way to completely get rid of it.

+

What is possible is to identify exactly the conditions under which such weakness acts, and see if there is a way to reduce the mere frequency of such events. In fact, for a standoff pointer to become outdated, several conditions must take place at the same time: + + +

The standoff notation must be used as a storage format, rather than just as a processing format;

+ + +

the source document must make sense even without the additional standoff markup (i.e., the standoff notation contains no information that is necessary for at least some types of document modifications);

+
+ +

the source document must be editable (and, in fact, must be edited) on its own;

+
+ +

the standoff pointers must rely on positions that change when the source is edited (e.g., character‐based locations);

+
+ +

editing must be done in contexts and with tools that cannot or do not update the standoff pointers; and

+
+ +

there must be no computable way to determine the modifications of the document (e.g., via a diff between the old and new versions).

+
+ +

+

Of course, no standoff notation can rule out that these conditions occur on their documents, but note that all six of them must occur for standoff pointers to become outdated. EARMARK is not safe from these occurrences either, but at least for the use cases here described, one or more of these conditions simply do not apply: EARMARK is mostly used as a processing format, with no need to save it on disk (conversion from the source formats such as MS Word is described later and does not require special storage), the data format described is either in a very specific format (e.g., MS Word or ODT) that in fact already does handle internally its data changes and requires the overlapping data exactly for this purpose, or is in fact the result of a diff action on successive versions of a document (as in the case of the wiki pages). Finally, EARMARK allows references to relatively stable fragment ids of the documents (by using XPath ranges without specifying explicitly begin and end locations) rather than the extremely fragile character locations, further reducing the chances of outdated pointers. For this reason, without being able to completely rule out the possibility of standoff pointers going wrong, we tend to consider it as a significantly small risk, at least for the use case described here.

+
+
+ Using OWL Versus RDF for Standoff Notations +

EARMARK is strongly based on OWL 2 DL (W3C OWL Working Group, + 2009) to express multiple markup layers with possible overlapping ranges over the same content. OWL 2 DL is not the only possible choice for expressing standoff notations via Semantic Web technologies. In fact, RDF is another valid and effective model for dealing with the same issue, as shown in Tummarello et al. ( + 2005), by means of the open‐source application programming interface (API) RDF Textual Encoding Framework (RDFTef). This API was created to demonstrate a plausible way for handling overlapping markup within documents and identifying textual content of a document as a set of independent RDF resources that can be linked mutually and with other parent resources.

+

Besides giving the possibility to define multiple structural markup hierarchies over the same text content, the use of RDF as the language for encoding markup allows one to specify semantic data on textual content as well. But the real main advantage in using RDF is the possibility of using particular built‐in resources specifically defined in the RDF syntax specification (Beckett, + 2004) for describing and dealing with different kinds of containers, either ordered (rdf:Seq) or unordered (rdf:Bag). Thus, RDF resources can be used to represent every printable element in the text—words, punctuation, characters, typographical symbols, and so on—while RDF containers also can be used to combine such fragments and containers.

+

RDF alone, however, is not sufficient to define a formal vocabulary for structural markup: Does a given resource represent an element, an attribute, a comment, or a text node? In which way is a resource of a certain type related to others? The specification of an RDFS (Brickley & Guha, + 2004) or of an OWL layer can successfully address these issues. Hybrid solutions obtained by mixing different models, even when they are built one upon another, may seem elegant, but they are not necessarily the best choice. In fact, there exist well‐known interoperability limits between OWL 2 DL and RDF that prevent the correct use of Semantic Web tools and technologies. In particular: + + +

Any markup document made using RDF containers (e.g., to describe what markup items contain and in which order) and OWL ontologies (e.g., to define classes of markup entities and their semantics) results in a set of axioms that end up outside of OWL DL and well within OWL Full, which limits the applicability of the most frequently used Semantic Web tools that are usually built upon the (computationally tractable) description logic underlying OWL 2 DL.

+ + +

The individual analysis of each language may not be applicable when we have to check particular properties that lie between RDF and OWL layers. For example, verifying the validity of a markup document against a particular schema, which is one of the most common activities with markup, needs to be made to work with both markup item structures (that would be defined in RDF) and logical constraints about classes of markup items (e.g., elements only, attributes only, the element “p,” all the elements of a particular namespace, etc., all of them definable in OWL).

+
+ +

+

Being able to express everything we need directly in OWL quite straightforwardly addresses both issues. The well‐known absence of containers and sequences in OWL can be overcome by modeling classes in specific ways using specific design patterns such as those in Ciccarese et al. ( + 2008) and in Drummond et al. ( + 2006).

+
+
+
+ Using EARMARK +

There are multiple applications for the EARMARK approach. The most interesting for this article is its capability of dealing with overlapping structures in an elegant and straightforward manner. Under EARMARK, such structures do not need to be specified through complex workarounds as with XML, but they are explicit and can be easily described and accessed. Sophisticated searches and content manipulations become very simple when using this ontological model.

+

The goal of this section is to demonstrate the soundness and applicability of EARMARK by discussing how the use cases presented earlier are addressed. Note that throughout the section we investigate multiple EARMARK data structures and documents, focusing on the feasibility and potentiality of such an ontological representation.

+
+ Looking for Authorial Changes in Office Documents +

The discussion about change tracking in office document formats showed that both ODT (OpenOffice format) and OOXML (Microsoft Word format) use complex data structures to store overlaps generated by change‐tracking functionalities. These structures make it very difficult to search and manipulate the content when using XML languages and tools. Even very simple edits generate a rather tangled set of overlapping elements.

+

Let us recall the example mentioned earlier where the user “John Smith” splits a single paragraph into two. The ODT representation is:

+

+ + + + + + +

+

The OOXML representation (shown earlier) is even more complex. In fact, these formats exploit in large scale (tangled) fragmentation (OOXML) or milestones and stand‐off markup (ODT) to deal with overlaps.

+

EARMARK, on the other hand, stores overlapping data in a direct and streamlined manner that does not require tools to rebuild information from the twists of a tree‐based XML structure. The information already is available and expressed through consistent RDF and OWL statements. Figure + 3 graphically shows the corresponding EARMARK document.

+
+ + + + + + + + +

Encoding in EARMARK the ODT change‐tracking example. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The original paragraph content and the new string “also” are now encoded as two docuverses over which the ranges r1, r2, and r3 are defined. The original paragraph is then composed of the (content of) ranges r1 and r2 while the paragraphs resulting after the (text and carriage return) insertion now comprise range r1 and ranges r2 and r3, respectively. Metadata about the author and the modification date are encoded as further RDF statements.

+

+ + + + + + +

+

The advantages of streamlining overlaps become apparent if we consider tasks a little beyond the mere display. For instance, the query for “the textual content of all paragraphs inserted by John Smith” ends up rather entangled if we used XPath on the ODT structure. The process for finding that textual content needs to browse the current version of the document and look for all the text:change‐start/text:change‐end pairs that refer to an insertion made by John Smith involving the creation of a new paragraph (i.e., text:change‐start is in a first paragraph while its pair, text:change‐end, is in the following one) that are either currently present in the document body or hidden behind a subsequent deletion made by someone else. Once the paragraphs are identified, we need to retrieve the content that originally was contained there (i.e., the text fragments that still are within those boundaries or that may have been deleted in subsequent versions). The following XPath represents an implementation of this process:

+

+ + + + + + +

+

The XML structure of an MS Word file, using segmentation rather than milestones, does simplify the query a bit, but still presents some radical complexities. The process starts by choosing all those w:p elements that were inserted by John Smith as well as all their previous and contiguous w:p elements that were deleted before or inserted after the first ones. In OOXML, each sequence of contiguous w:p elements implicitly represents one paragraph. Therefore, we can now take all the text fragments contained in each w:p sequence that were inserted before or deleted after the paragraph defined by the sequence itself. The following is the resulting XPath for an OOXML document.

+

+ + + + + + +

+

The complexity of both XPath queries is due to the intrinsic complexity of the data structure on which the query has to work. Although the interface of OpenOffice or MS Word may provide tools to directly deal with these queries using specific strategies on the internal data structures, applications working directly on the XML structure have very little help in disentangling the mess of the data formats.

+

On the other hand, since EARMARK documents are actually OWL files, it is possible to access and query them with plain Semantic Web tools. Powerful searches then can be performed without using niche‐specific tools or complex and long XPath expressions simply with mainstream technologies such as SPARQL (Garlik & Seaborne, + 2010).

+

The corresponding SPARQL query for (“the textual content of all paragraphs inserted by John Smith”) therefore can be written as follows:

+

+ + + + + + +

+

But EARMARK is useful for even more than querying: EARMARK also decreases the costs, in terms of efforts and lines of code, for manipulating documents.

+

Let us consider the task of generating an intermediate version (i.e., neither the first nor the last one of a version chain) from a document that includes change‐tracking information about the whole document history.

+

The process of rebuilding these versions by working on the XML structure without specific APIs is both complex and inefficient. For example, a basic XSLT that returns an XML document defining the desired version requires to at least: + + +

define templates for all the elements actively involved in the change tracking (e.g., for ODT, text:changed‐region, text:change‐start, text:change‐end, and text:change and similarly for OOXML) to understand, by looking at their creation date, whether they must be considered or ignored when building the requested version. In particular, we must exclude insertions following and deletions preceding the version we are building;

+ + +

define templates for paragraphs to handle cases where the paragraph is the result of an insertion or a deletion of other paragraphs to identify whether it should be considered for the result and, in such case, finding out its real text content and remembering that in the following versions, such content may have spread out among other paragraphs;

+
+ +

define templates for handling insertions/deletions for structures such as images, sections, lists, and tables; and

+
+ +

define an identity template for the other elements to visit the entire document.

+
+ +

+

Even the most basic and incomplete implementation of such XSLT requires hundreds of lines of complex and convoluted code and a large number of ad hoc decisions based on the specificities of whether we start from ODT or OOXML. Note also that a Java‐based implementation (or in any other procedural language) of the same process would be equally or even more complex.

+

The same result can be achieved on EARMARK documents with a few lines of Java code:

+

+ + + + + + +

+

This approach uses the EARMARK Java API + 17 + +

+ http://earmark.sourceforge.net +

+ + and a single SPARQL query, runnable on any SPARQL 1.1 processor such as Jena, to identify the root node of the subtree of the version that is associated with the specified date and creator. Then, it performs a simple, recursive, depth‐first visit to clone all the nodes in the tree and to combine them in the output EARMARK document.

+

This method heavily uses Semantic Web technologies on the structures provided by EARMARK whose characteristics are always explicit and clear. In fact, since all versions coexist within the EARMARK document and each version can be encoded explicitly as a tree within the overall graph, this operation is straightforward and fast.

+
+
+ Improving Semantic Annotations +

EARMARK also can be exploited to improve semantic annotations. As noted earlier, there are in fact strong limitations in the same process of annotating web documents with semantic structures that overlap the structural ones. The same example—of vcards that cannot be created on the top of tables organized per rows—will be used in this section.

+

We solve this by converting the web document with annotations into an EARMARK document, allowing both semantic and structural annotations to coexist. Through EARMARK, we can explicitly express both markup structures and vcard assertions. Figure + 4 shows how the vcard example can be modeled. (Once again, we show a graphical representation for the sake of clarity.)

+
+ + + + + + + + +

The abstract model of the EARMARK document solving the microformats issue. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The textual content of the original table cells is now encoded in two different docuverses: one for the header (with roles) and one for the body (with names of committee members). Ranges r1, r2, …, r8 are then created to distinguish each role and name. Two independent and coexisting hierarchies are then built on top of the same set of ranges: the HTML table that includes one cell for each range (in blue) and the vcards about each person (in green) that include only the relevant ranges and overlap the previous one. Note also that the vcards are defined in such a way that does not interfere with the structural features of the table. The full linearization in OWL of this example can be found at + http://www.essepuntato.it/2011/jasist/examples +

+
+
+ Improving Wiki Content Reversions +

EARMARK can be used to improve wiki reversion mechanisms and overcome the limitations discussed earlier: The automatic filtering and merging of contributions from multiple versions of the same page are still a manual process, but it can be fully automatized if the overlapping structures buried in the whole history of the page become explicit.

+

The role of EARMARK is to make those structures explicit and available for more sophisticated content manipulation. To understand the extent EARMARK structures can be derived from wikis and how they can be exploited by the final users, we use as our example the wiki platform MediaWiki + 18 + +

+ http://www.mediawiki.org +

+ + (i.e., the wiki engine of Wikipedia).

+

MediaWiki offers sophisticated functionalities for creating diffs of wiki content. Users can compare any two revisions in the page history and highlight changes in a friendly interface that shows modifications with a word‐level granularity. Diff pages contain metadata about each compared version (when the version was created, who the author was, or which IP address an anonymous author was connected from, etc.) and a two‐column table showing the changes side by side. Changes are detected a posteriori by comparing two arbitrary versions, which are not even required to be temporally contiguous.

+

The output of the MediaWiki diff engine has regularities that can be exploited to automatically build the overlapping structures of the diff and to express them in EARMARK. Let us consider a fictitious example summarized in Table + 1, where an initial text is revised three times by different authors.

+ + + All the versions of a wiki page modified by different authors. + + + + + + + + + + Version Author + V1 151.61.3.122 + V2 Angelo Di Iorio + V3 Silvio Peroni + V4 Fabio Vitali + + + + + Content + Bob was farming carrots and tomatoes. + Bob was farming carrots, tomatoes and beans. + Bob was farming carrots, tomatoes and green beans. They were all tasteful. + Bob was farming carrots, tomatoes and green beans. [new paragraph] They were all tasteful. + + + +
+
+

To display the differences between V1 and V2, MediaWiki creates a page whose HTML code is as follows: + 19 + +

For the sake of clarity, we removed all markup irrelevant to our discussion.

+ +

+

+ + + + + + +

+

This is an HTML table of two rows, the first showing metadata (date and author of the modification), and the second showing the actual modifications. The first cell of the second row contains all the unmodified text and a del element for each inline fragment that was deleted. The second cell contains all the unmodified text and an ins element for each inline fragment that was inserted. Thus, these cells share exactly the same unmodified part(s) of the two compared versions.

+

When the structure itself is modified rather than merely the text, the source code of the MediaWiki diff is slightly different. Thus, the diff between V3 and V4 (which splits a paragraph in two) is as follows:

+

+ + + + + + +

+

The diff output is not complete or sophisticated, and of course, it is a completely different task to replan such an algorithm (but for a first idea of natural changes in diffing XML documents, see Di Iorio, Marchetti, Schirinzi, & Vitali, + 2009). Thus, limitations of that algorithm are inevitably shared by any EARMARK representation. Yet, this output is sufficiently rich to allow us to extract the overlapping information we need. For instance, the insertion of a nonbreaking space or a carriage return generates rows according to specific rules that can be easily detected to capture the actual change by the author.

+

Figure + 5 shows the aforementioned example rebuilt in EARMARK. All versions are encoded in the same document by creating overlapping assertions over the docuverses. Metadata and RDF statements are layered on top of those assertions and create a rich knowledge‐base about the history of the documents and, in particular, about the history of each fragment.

+
+ + + + + + + + +

The wiki sample versions encoded in a single EARMARK document. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

Due to the complexity of the example, we labeled arrows with numbers that indicate the position of each range within each markup item. Consider, for instance, Version V4: It is composed of two DIV elements, the first one containing the concatenation of “Bob was farming carrots” + “,” + “tomatoes” + “and” + “green” + “beans” + “.” and the second one containing the string “They were all tasteful.”

+

Implementing a wiki content‐filtering mechanism on top of such a structure is rather simple. For instance, the removal of all the contributions of “Angelo Di Iorio” that leaves untouched all the content written (previously and subsequently) by “Silvio Peroni” and “Fabio Vitali” can be performed straightforwardly. Three steps are enough to apply such an intermediate content reversion: + + +

the identification of the fragments written by “Angelo Di Iorio,” which is a straightforward SPARQL query on the embedded statements;

+ + +

the creation of a new version where references to those fragments are removed and references to fragments no longer in the document are correctly fixed;

+
+ +

the translation of that document into an actual MediaWiki page through the serialization process described in Peroni and Vitali ( + 2009).

+
+ +

+

Of course, an automatic process may generate ambiguities or even errors in the resulting content (i.e., some parts may become dangling, wrong, or unclear after removing text fragments elsewhere); grammar discrepancies also might be generated by the same approach. Linguistic and semantic problems, however, become a problem once the technical issues of managing independent, yet successive, edits are solved. What is important is that all the information about overlaps and dependencies among fragments is available in EARMARK and can easily be searched, filtered, and manipulated. Besides, foreseeing a manual intervention for checking and polishing automatically filtered content is perfectly in line with the wiki philosophy, so that the wiki community itself can wisely use the reversion tools to revise the content and adjust any intervening minor nuisances or imperfections. Such checks would still be far simpler and faster than would the manual process of partially reverting versions as we have today.

+
+
+
+ Generating EARMARK From Existing Documents: The ROCCO Approach +

Since we do not expect documents to be natively written in EARMARK or manually created by users, we need a way to extract EARMARK data structures from existing XML‐based resources, which is trivial when the XML is simple and clearly hierarchical and slightly more complex when the XML contains workarounds to force an intrinsically overlapping situation into a single hierarchy.

+

We designed a reliable process to transform XML files into EARMARK documents that fully captures overlapping structures even when the overlaps are hidden in one of the many well‐known workarounds. This approach takes as input an XML file and produces the corresponding EARMARK document in five steps: Read, Overhaul, Convert, Classify, and Organize (hence, the name ROCCO).

+

Since ROCCO is not the main topic of this article, we very briefly discuss the issue of converting XML into EARMARK, explaining how each step works. The ROCCO algorithm performs five steps, described next.

+
+ Read and Overhaul +

The first two steps consist of loading the XML source file and, if needed, adding information useful for further processing. In EARMARK, there is a clear distinction between the textual content of a document and the structures built on top of it: The content is stored as plain text—within docuverses—and all structures are externalized and expressed through OWL and RDF assertions.

+

While OpenOffice stores all overlapping structures in the main document file, some other editors (e.g., MS Word) store overlaps in many different ways, even in a separate file. The overhaul step extracts such data and adds them to the main content document by exploiting format‐specific procedures, implemented via XSLT in most cases.

+
+
+ Convert +

The subsequent step consists of converting the XML source file into an early EARMARK document that expresses exactly the same information and hierarchies. No interpretation or disentanglement of workarounds is performed at this step.

+

Since the input is XML, this translation can be performed directly via a generic XSLT stylesheet. It basically consists of a recursive algorithm that parses the source file and generates the corresponding instances in the EARMARK ontology. Such a translation is straightforward and not difficult.

+
+
+ Classify +

The “Classify” step extends the EARMARK document built so far with information about the workarounds used to encode overlaps. That information will be exploited in the subsequent steps to make those overlaps explicit.

+

The basic idea is to exploit OWL reasoners to detect workarounds in an early EARMARK document D by: + + +

defining an ontology O that models all the workarounds used by applications, such as milestones, stand‐off markup, etc.; these workarounds are specific to the data format used in the source document;

+ + +

specifying the EARMARK document D as an ABox for the ontology O;

+
+ +

defining SWRL rules that capture the role of each element in D and check relationships between elements;

+
+ +

running an OWL reasoner, such as Pellet, on D+O to create new OWL instances and properties that identify which workarounds are present.

+
+ +

+

The actual detection of workarounds is delegated to an external reasoner. Refining detection strategies and even adding new strategies for new formats all can be done via OWL and SPARQL. Indeed, tricky issues need to be addressed—mostly depending on the idiosyncrasies of the original formats—but no procedural code is required.

+
+
+ Organize +

The final step consists of building yet another EARMARK document that expresses the overlaps and metadata in an explicit way, based on the information collected by the previous steps. This phase consists of mapping operations from the native format into the EARMARK structure. Such conversion relies on the identification of metadata to classify the operations and to externalize relevant metadata in separate RDF statements.

+
+
+
+ Evaluating EARMARK +

One of the most frequent criticisms when proposing a different approach to solving a well‐known problem in information and communication technology is that the new solution may simplify the difficulties of the specific problem, but brings with it hidden costs in terms of size of the data structure, computation efforts, or conversion restrictions that compensate the advantages. In our case, one of the anonymous reviewers of our article (Di Iorio, Peroni, & Vitali, + 2009) wondered whether a difference in file size could weigh in on the convenience of adopting EARMARK as opposed to working with the original files.

+

As such, a discussion of cost functions of EARMARK versus other formats is in order. Yet, a systematic discussion of the relative costs (e.g., in byte size) of some original XML‐based data structures versus their EARMARK equivalent is an open‐ended undertaking that heavily depends on the original XML data structure and the specific features present in the document, and is badly defined anyway: While XML is a linearization format immediately expressible in actual bytes, OWL (or more precisely, RDF, the language in which OWL ontologies are expressed) is an abstract structure that allows a large number of linearization formats (including XML itself) with corresponding huge differences in the final byte counts.

+

For these reasons, to provide at least an initial test of meaningful concepts, we selected two XML‐based data formats (OOXML and ODT) and, specifically, a set of documents where overlapping tricks were present (i.e., where change‐tracking was active). To bypass the size discussion, we decided not to test byte lengths (which are not meaningful and easily skewed, e.g., by reducing the string length of the element names or of the class names) but the number of nodes for XML documents and of triples for OWL documents. This comparison again is not particularly appropriate (Triples are naturally numerous in OWL ontologies, and it is customary to deal with hundreds of thousands and even millions of assertions in Semantic Web applications.) but closer to meaningfulness than is the mere byte count.

+

Our comparison was carried out on a small set of documents in ODT and OOXML that included change‐tracking information. As discussed in the previous sections, change‐tracking facilities generate rather complex overlaps even for basic operations on small text fragments, which in turn are expressed as a potentially huge number of standoffs and milestone markup within the XML hierarchy. The same documents were individually converted into EARMARK. We then charted how simple edits under change‐tracking affect the number of nodes in XML formats and of statements in OWL files. + 20 + +

The full details about each version and each format also are available at + http://www.essepuntato.it/2011/jasist/discussion +

+ +

+

We created seven different versions, named after the “Seven Dwarfs” for recognizability, by applying very common edits (e.g., the insertion of a few words, the deletion of some sentences, the split of a paragraph, etc.) on a small document, creating multiple overlaps. Figure + 6 shows the results of our comparison.

+
+ + + + + + + + +

A graph summarizing the results of the first experiment. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

The overall trend is interesting and comforting: While in simple documents with no overlap the node count of XML is lower than is the assertion count of EARMARK triples, the presence of overlaps makes EARMARK and XML formats comparable. The growth of EARMARK statements is in fact very close to the growth of XML nodes when the number of overlaps increases. EARMARK is even more efficient than is XML for more complex documents.

+

The measure for each format was done by counting only those nodes and statements instrumental to encode content and (overlapping) structures: We did not take into account either the presentational information for ODT and OOXML (Each file, for instance, includes a very long list of style definitions that are not relevant for the purposes of our analysis.) or namespace declarations (OOXML files, for instance, list all relevant namespaces for the Office toolkit.) or ignorable white spaces (that are only added to indent content and improve readability).

+

Interestingly, EARMARK and ODT show a very similar increase in size while OOXML is much more verbose and grows faster. The content of the first version, for instance, is encoded using four nodes in ODT, 13 statements in EARMARK, and 54 nodes in OOXML; the last one contains 241 ODT nodes, 233 EARMARK statements, and 452 OOXML nodes. To return to our original inquiry, it is clear that the weight of EARMARK documents is very good compared to the other ones.

+

Also note the regularity in the growth of EARMARK statements. Regardless of the actual modifications applied to the document, in fact, EARMARK adds about 40 statements for each edit. Both OOXML and ODT, on the contrary, show a more irregular “pace.” The reason for this is that EARMARK externalizes all assertions, so that all modifications (either to leaf nodes or to intermediate nodes in the original XML) are “flattened” onto the docuverses and do not depend on the complexity of the structure within which the edit took place.

+

Figure + 7 shows the results of a similar comparison on a different set of documents and edits. We collected seven versions named after the days of the week and created by seven different authors when editing a very simple document. The overall trend does not change, and shows that EARMARK and ODT again have a comparable behavior, far better than that of OOXML.

+
+ + + + + + + + +

A graph summarizing the results of the second experiment. [Color figure can be viewed in the online issue, which is available at + wileyonlinelibrary.com.]

+ +
+

In conclusion, although preliminary, this study shows clear trends of a very conservative behavior of EARMARK with respect to document size.

+
+
+ Conclusions +

Overlaps, far from being an obscure requirement for sophisticated functionalities of arcane markup languages, are a very frequent undertaking even in major data formats and in rather frequent situations. Yet, since the XML language does not allow them, consciously or not, designers of data formats have adopted a huge and entangled array of tricks, special cases, and workarounds that, although solving the actual problem of storing overlapping structures, open new and complicated ones when approaching even basic chores on documents containing them, such as queries.

+

The EARMARK approach drastically reduces the efforts needed to perform such chores on overlapping structures since it does not allow the corresponding multiple trees to actually entangle and complicate the job. EARMARK is radically different from both special markup metalanguages that allow overlaps and the introduction of workarounds within the traditional tree‐oriented XML language because it treats multiple trees over the same content as first‐class citizens of the language, yet uses well‐known and standard W3C technologies and languages to perform all tasks. EARMARK documents, at the end, are OWL ontologies. Thus, any Semantic Web technology (e.g., SPARQL) can be used straightforwardly to perform operations on their content.

+

Improving queries is not the only application of EARMARK. Validation is another interesting field that we are investigating. In fact, the same ontological framework can be used to prove properties concerning a document, such as validity against a schema, compliance to co‐constraint specifications, or adherence to structural patterns. Moreover, inspired by Marcoux and Rizkallah ( + 2009), in which they described an approach for defining natural‐language semantics for XML‐based languages, we also are developing an ontology‐based approach for encoding markup semantics—that is, the formal definition of meanings of markup elements, besides the syntactical structure of a markup document—within EARMARK documents.

+
+ + References + + + + Adida, + B. + , + + Birbeck, + M. + , + + McCarron, + S. + , & + + Pemberton, + S. + + ( + 2008). RDFa in XHTML: Syntax and processing. W3C Recommendation, October 14, 2008, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/rdfa‐syntax/ + + + + + + Allsopp, + J. + + ( + 2007). + Microformats: Empowering your markup for Web 2.0. + New York, NY: + Friends of ED Press. + + + + + + Bański, + P. + + ( + 2010). + Why TEI stand‐off annotation doesn't quite work: And why you might want to use it nevertheless. In + Proceedings of Balisage: The Markup Conference 2010. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://www.balisage.net/Proceedings/vol5/html/Banski01/BalisageVol5‐Banski01.html + + + + + + Beckett, + D. + + ( + 2004). RDF/XML syntax specification (Rev.). W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/2004/REC‐rdf‐syntax‐grammar‐20040210/ + + + + + + Berglund, + A. + , + + Boag, + S. + , + + Chamberlin, + D. + , + + Fernández, + M.F. + , + + Kay, + M. + , + + Robie, + J. + , & + + Siméon, + J. + + ( + 2007). XML Path Language (XPath) 2.0. W3C Recommendation, January 23, 2007, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/xpath20/ + + + + + + Brickley, + D. + , & + + Guha, + R.V. + + ( + 2004). RDF Vocabulary Description Language 1.0: RDF Schema. W3C Recommendation, February 10, 2004, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/rdf‐schema/ + + + + + + Ciccarese, + P. + , + + Wu, + E. + , + + Kinoshita, + J. + , + + Wong, + G. + , + + Ocana, + M. + , + + Ruttenberg, + A. + , & + + Clark, + T. + + ( + 2008). + The SWAN biomedical discourse ontology. + Journal of Biomedical Informatics, + 41( + 5), + 739– + 751. + + + + + + DeRose, + S. + ( + 2004). + Markup overlap: A review and a horse. In + Proceedings of the Extreme Markup Languages 2004. + Rockville, MD: + Mulberry Technologies. 
Retrieved from + http://conferences.idealliance.org/extreme/html/2004/DeRose01/EML2004DeRose01.html + + + + + + Di Iorio, + A. + , + + Marchetti, + C. + , + + Schirinzi, + M. + , & + + Vitali, + F. + + ( + 2009). + Natural and multi‐layered approach to detect changes in tree‐based textual documents. In + + J. + Cordeiro + & + + J. + Filipe + (Eds.), + Proceedings of the 11th International Conference on Enterprise Information Systems (ICEIS 2009) (pp. + 90– + 101). + Heidelberg, Germany: + Springer. + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + ( + 2009). + Towards markup support for full GODDAGs and beyond: The EARMARK approach. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Peroni01/BalisageVol3‐Peroni01.html + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + + ( + 2010). + Handling markup overlaps using OWL. In + + P. + Cimiano + & + + H. S. + Pinto + (Eds.), + Proceedings of the 17th International Conference on Knowledge Engineering and Knowledge Management (EKAW 2010) (pp. + 391– + 400). + Heidelberg, Germany: + Springer. + + + + + + Di Iorio, + A. + , + + Peroni, + S. + , & + + Vitali, + F. + (in press). + Using Semantic Web technologies for analysis and validation of structural markup. + International Journal of Web Engineering and Technology. + + + + + + Drummond, + N. + , + + Rector, + A. + , + + Stevens, + R. + , + + Moulton, + G. + , + + Horridge, + M. + , + + Wang, + H.H. + , & + + Seidenberg, + J. + + ( + 2006). + Putting OWL in order: Patterns for sequences in OWL. In + + B. C. + Grau + , + + P. + Hitzler + , + + C. + Shankey + , & + + E. + Wallace + (Eds.), + Proceedings of the Workshop on OWL: Experiences and Directions (OWLED 2006), + Athens, GA. Retrieved from + http://sunsite.informatik.rwth‐aachen.de/Publications/CEUR‐WS/Vol‐216/submission_12.pdf + + + + + + Durand, + D.G. 
+ + ( + 1994, October). + Palimpsest, a data model for revision control. Paper presented at the Workshop on Collaborative Editing Systems at the Computer Supported Cooperative Work Conference (CSCW94), Chapel Hill, NC. + + + + + + Durand, + D.G. + ( + 2008). + Palimpsest: Change‐oriented concurrency control for the support of collaborative applications. + Charleston, SC: + CreateSpace. + + + + + + Garlik, + S.H. + , & + + Seaborne, + A. + + ( + 2010). SPARQL 1.1 Query Language. W3C Working Draft, October 14, 2010, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/sparql11‐query/ + + + + + + Georg, + R. + , + + Schonefeld, + O. + , + + Trippel, + T. + , & + + Witt, + A. + + ( + 2010). + Sustainability of linguistic resources revisited. In + Proceedings of the International Symposium on XML for the Long Haul: Issues in the Long‐Term Preservation of XML. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://www.balisage.net/Proceedings/vol6/html/Witt01/BalisageVol6‐Witt01.html + + + + + + Goldfarb, + C.F. + ( + 1990). + The SGML Handbook. + New York, NY: + Oxford University Press. + + + + + + Horridge, + M. + , & + + Patel‐Schneider, + P. + + ( + 2009). OWL 2 Web Ontology Language: Manchester Syntax. W3C Working Group Note October 27, 2009, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/owl2‐manchester‐syntax/ + + + + + + Horrocks, + I. + , + + Patel‐Schneider, + P.F. + , + + Boley, + H. + , + + Tabet, + S. + , + + Grosof, + B. + , & + + Dean, + M. + + ( + 2004). SWRL: A Semantic Web rule language combining OWL and RuleML. W3C Member Submission, May 21, 2004, World Wide Web Consortium. Retrieved + http://www.w3.org/Submission/SWRL/ + + + + + + Huitfeldt, + C. + , & + + Sperberg‐McQueen, + C.M. + + ( + 2003). TexMECS: An experimental markup meta‐language for complex documents. Retrieved from + http://decentius.aksis.uib.no/mlcd/2003/Papers/texmecs.html + + + + + JTC1/SC34 WG 4. ( + 2008). 
ISO/IEC 29500‐1:2008—Information technology—Document description and processing languages—Office Open XML File Formats: Part 1. + Fundamentals and markup language reference. + Geneva, Switzerland: + International Organization for Standardization. + + + + + JTC1/SC34 WG 6. ( + 2006). + ISO/IEC 26300:2006—Information technology—Open document format for office applications (OpenDocument), Version 1.0. + Geneva, Switzerland: + International Organization for Standardization. + + + + + + Marcoux, + Y. + , & + + Rizkallah, + E. + + ( + 2009). + Intertextual semantics: A semantics for information design. + Journal of the American Society for Information Science and Technology, + 60( + 9), + 1895– + 1906. + + + + + + Marinelli, + P. + , + + Vitali, + F. + , & + + Zacchiroli, + S. + + ( + 2008). + Towards the unification of formats for overlapping markup. + New Review of Hypermedia and Multimedia, + 14( + 1), + 57– + 94. + + + + + + Nelson, + T. + ( + 1980). + Literary machines: The report on, and of, Project Xanadu concerning word processing, electronic publishing, hypertext, thinkertoys, tomorrow's intellectual … including knowledge, education and freedom. + Sausalito, CA: + Mindful Press. + + + + + + Peroni, + S. + , & + + Vitali, + F. + + ( + 2009). + Annotations with EARMARK for arbitrary, overlapping and out‐of order markup. In + + U.M. + Borghoff + & + + B. + Chidlovskii + (Eds.), + Proceedings of the 2009 ACM Symposium on Document Engineering (DocEng 2009) (pp. + 171– + 180). + New York, NY: + ACM. + + + + + + Portier, + P. + , & + + Calabretto, + S. + + ( + 2009). + Methodology for the construction of multi‐structured documents. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Portier01/BalisageVol3‐Portier01.html + + + + + + Riggs, + K.R. + + ( + 2002). + XML and free text. 
+ Journal of the American Society for Information Science and Technology, + 53( + 6), + 526– + 528. + + + + + + Salembier, + P. + , & + + Benitez, + A.B. + + ( + 2007). + Structure description tools. + Journal of the American Society for Information Science and Technology, + 58( + 9), + 1329– + 1337. + + + + + + Schmidt, + D. + + ( + 2009). + Merging multi‐version texts: A generic solution to the overlap problem. In + Proceedings of Balisage: The Markup Conference 2009. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://balisage.net/Proceedings/vol3/html/Schmidt01/BalisageVol3‐Schmidt01.html + + + + + + Schmidt, + D. + , & + + Colomb, + R. + + ( + 2009). + A data structure for representing multi‐version texts online. + Journal of Human–Computer Studies, + 67( + 6), + 497– + 514. + + + + + + Schonefeld, + O. + , & + + Witt, + A. + ( + 2006). + Towards validation of concurrent markup. In + Proceedings of the Extreme Markup Languages 2006. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://conferences.idealliance.org/extreme/html/2006/Schonefeld01/EML2006Schonefeld01.html + + + + + + Sperberg‐McQueen, + C.M. + + ( + 2006). + Rabbit/duck grammars: A validation method for overlapping structures. In + Proceedings of Extreme Markup Languages Conference 2006. + Rockville, MD: + Mulberry Technologies. Retrieved from + http://conferences.idealliance.org/extreme/html/2006/SperbergMcQueen01/EML2006SperbergMcQueen01.html + + + + + + Sperberg‐McQueen, + C.M. + , & + + Huitfeldt, + C. + + ( + 2004). + GODDAG: A data structure for overlapping hierarchies. In + + P.R. + King + & + + E.V. + Munson + (Eds.), + Proceedings of the 5th International Workshop on the Principles of Digital Document Processing (PODDP 2000) (pp. + 139– + 160). + Heidelberg, Germany: + Springer. + + + + + TEI Consortium. ( + 2005). TEI P5: Guidelines for electronic text encoding and interchange. Retrieved from + http://www.tei‐c.org/Guidelines/P5 + + + + + + Tennison, + J. 
+ , & + + Piez, + W. + + ( + 2002, August). The Layered Markup and Annotation Language (LMNL). Paper presented at the Extreme Markup Languages Conference 2002, Montreal, Canada. + + + + + + Tummarello, + G. + , + + Morbidoni, + C. + , & + + Pierazzo, + E. + + ( + 2005). + Toward textual encoding based on RDF. In + + M. + Dobreva + & + + J. + Engelen + (Eds.), + Proceedings of the Ninth ICCC International Conference on Electronic Publishing (ELPUB2005). + Leuven, Belgium: + Peeters. + + + + + W3C OWL Working Group. ( + 2009). OWL 2 web ontology language document overview. W3C Recommendation, October 27, 2009, World Wide Web Consortium. Retrieved from + http://www.w3.org/TR/owl2‐overview/ + + + + + 
diff --git a/xslt/Wileyml3g.xsl b/xslt/Wileyml3g.xsl new file mode 100644 index 0000000..d225f23 --- /dev/null +++ b/xslt/Wileyml3g.xsl @@ -0,0 +1,3401 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Version 0.6 générée le + + + + + 3.6 + + + + + + + + + + + + + + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="."/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + + + + + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="."/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + abbreviated + + + + + + + + <xsl:variable name="ttlsh"> + <xsl:apply-templates select="//a1:component/a1:header/a1:contentMeta/a1:titleGroup/a1:title[@type='short']"></xsl:apply-templates> + </xsl:variable> + <xsl:value-of select="normalize-space($ttlsh)"/> + + + + + + + + abbreviated + + + + + + + + <xsl:variable name="ttlsh"> + <xsl:apply-templates select="//a1:publicationMeta[@type='article']/a1:titleGroup/a1:title[@type='short']"></xsl:apply-templates> + </xsl:variable> + <xsl:value-of select="normalize-space($ttlsh)"/> + + + + + + + + + + translated + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="."/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + + + + alternative + + CDATA + + + + + + + + + + + + + + + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:value-of select="normalize-space(./text())"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + + + + + + + + + alternative + + + + + + + CDATA + + + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:value-of select="normalize-space(//a1:component/a1:header/a1:contentMeta/a1:titleGroup/a1:title[@type='subtitle'])"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + + + + + + + + + + + + + corporate + + + + + + + + + + + + + + + + + + + + + + + + personal + + + + + + E-mail: + + + + + + + + + + + + + + + + 
+ + + + + + + E-mail: + + + + + + + + + + + Correspondence: + + + + + + + + + + + + Correspondence: + + + + + + + + + + + + + + text + author + + + + + + + text + editor + + + + + + + + + + + + + + + + + + + text + + + + + + + + article + review-article + editorial + book reviews + brief communication + article + brief communication + case report + abstract + + other + + + + + + + + + + + + + + + + + + + + + + + + + + Blackwell Publishing Ltd + + + + + + + text + + + + + + + + w3cdtf + + + + + + + w3cdtf + + + + + + + + w3cdtf + + + + + + w3cdtf + + + + + + w3cdtf + + + + + + + + + + + + + + + + + + + + + + + + + w3cdtf + + + + + + + + + + + + + + + + + + + + + code + iso639-2b + + + + + + code + iso639-2b + + + + + + + + + + + + code + rfc3066 + + + + + + code + iso639-2b + + + + + + aar + abk + afr + aka + alb + amh + ara + arg + hye + arm + asm + ava + ave + aym + aze + bak + bam + eus + baq + bel + ben + bih + bis + bod + bos + bre + bul + bur + mya + cat + ces + cha + che + chi + chu + chv + cor + cos + cre + cym + cze + dan + ger + div + dut + nld + dzo + eng + epo + est + ewe + fao + fij + fin + fre + fry + ful + kat + ger + gla + gle + glg + glv + gre + grn + guj + hat + hau + heb + her + hin + hmo + hrv + hun + ibo + isl + ido + iii + iku + ile + ina + ind + ipk + ita + jav + jpn + kal + kan + kas + kau + kaz + khm + kik + kin + kir + kom + kon + kor + kua + kur + lao + lat + lav + lim + lin + lit + ltz + lub + lug + mac + mkd + mah + mal + mri + mao + mar + may + msa + mlg + mlt + mon + nau + nav + nbl + nde + ndo + nep + nno + nob + nor + nya + oci + oji + ori + orm + oss + pan + fas + per + pli + pol + por + pus + que + roh + rum + run + rus + sag + san + scc + sin + slo + slk + slv + sme + smo + sna + snd + som + sot + spa + sqi + srd + srp + ssw + sun + swa + swe + tah + tam + tat + tel + tgk + tgl + tha + tib + tir + ton + tsn + tso + tuk + tur + twi + uig + ukr + urd + uzb + ven + vie + vol + wel + wln + wol + xho + yid + yor + zha + zho + zul + aar 
+ abk + afr + aka + alb + amh + ara + arg + hye + arm + asm + ava + ave + aym + aze + bak + bam + eus + baq + bel + ben + bih + bis + bod + bos + bre + bul + bur + mya + cat + ces + cha + che + chi + chu + chv + cor + cos + cre + cym + cze + dan + ger + div + dut + nld + dzo + eng + epo + est + ewe + fao + fij + fin + fre + fry + ful + kat + gla + gle + glg + glv + gre + grn + guj + hat + hau + heb + her + hin + hmo + hrv + hun + ibo + isl + ido + iii + iku + ile + ina + ind + ipk + ita + jav + jpn + kal + kan + kas + kau + kaz + khm + kik + kin + kir + kom + kon + kor + kua + kur + lao + lat + lav + lim + lin + lit + ltz + lub + lug + mac + mkd + mah + mal + mri + mao + mar + may + msa + mlg + mlt + mon + nau + nav + nbl + nde + ndo + nep + nno + nob + nor + nya + oci + oji + ori + orm + oss + pan + fas + per + pli + pol + por + pus + que + roh + rum + run + rus + sag + san + scc + sin + slo + slk + slv + sme + smo + sna + snd + som + sot + spa + sqi + srd + srp + ssw + sun + swa + swe + tah + tam + tat + tel + tgk + tgl + tha + tib + tir + ton + tsn + tso + tuk + tur + twi + uig + ukr + urd + uzb + ven + vie + vol + wel + wln + wol + xho + yid + yor + zha + zho + zul + + + + + + + + + + + + code + rfc3066 + + + + + + code + iso639-2b + + + + + + aar + abk + afr + aka + alb + amh + ara + arg + hye + arm + asm + ava + ave + aym + aze + bak + bam + eus + baq + bel + ben + bih + bis + bod + bos + bre + bul + bur + mya + cat + ces + cha + che + chi + chu + chv + cor + cos + cre + cym + cze + dan + ger + div + dut + nld + dzo + eng + epo + est + ewe + fao + fij + fin + fre + fry + ful + kat + gla + gle + glg + glv + gre + grn + guj + hat + hau + heb + her + hin + hmo + hrv + hun + ibo + isl + ido + iii + iku + ile + ina + ind + ipk + ita + jav + jpn + kal + kan + kas + kau + kaz + khm + kik + kin + kir + kom + kon + kor + kua + kur + lao + lat + lav + lim + lin + lit + ltz + lub + lug + mac + mkd + mah + mal + mri + mao + mar + may + msa + mlg + mlt + mon + nau + nav + 
nbl + nde + ndo + nep + nno + nob + nor + nya + oci + oji + ori + orm + oss + pan + fas + per + pli + pol + por + pus + que + roh + rum + run + rus + sag + san + scc + sin + slo + slk + slv + sme + smo + sna + snd + som + sot + spa + sqi + srd + srp + ssw + sun + swa + swe + tah + tam + tat + tel + tgk + tgl + tha + tib + tir + ton + tsn + tso + tuk + tur + twi + uig + ukr + urd + uzb + ven + vie + vol + wel + wln + wol + xho + yid + yor + zha + zho + zul + aar + abk + afr + aka + alb + amh + ara + arg + hye + arm + asm + ava + ave + aym + aze + bak + bam + eus + baq + bel + ben + bih + bis + bod + bos + bre + bul + bur + mya + cat + ces + cha + che + chi + chu + chv + cor + cos + cre + cym + cze + dan + ger + div + dut + nld + dzo + eng + epo + est + ewe + fao + fij + fin + fre + fry + ful + kat + ger + gla + gle + glg + glv + gre + grn + guj + hat + hau + heb + her + hin + hmo + hrv + hun + ibo + isl + ido + iii + iku + ile + ina + ind + ipk + ita + jav + jpn + kal + kan + kas + kau + kaz + khm + kik + kin + kir + kom + kon + kor + kua + kur + lao + lat + lav + lim + lin + lit + ltz + lub + lug + mac + mkd + mah + mal + mri + mao + mar + may + msa + mlg + mlt + mon + nau + nav + nbl + nde + ndo + nep + nno + nob + nor + nya + oci + oji + ori + orm + oss + pan + fas + per + pli + pol + por + pus + que + roh + rum + run + rus + sag + san + scc + sin + slo + slk + slv + sme + smo + sna + snd + som + sot + spa + sqi + srd + srp + ssw + sun + swa + swe + tah + tam + tat + tel + tgk + tgl + tha + tib + tir + ton + tsn + tso + tuk + tur + twi + uig + ukr + urd + uzb + ven + vie + vol + wel + wln + wol + xho + yid + yor + zha + zho + zul + + + + + + + + + + + + + + + + + text/html + + + + + figures + + + + + + + + tables + + + + + + + + formulas + + + + + + + + references + + + + + + + + words + + + + + + + + + + + + + + graphical + + + + + synopsis + + + + + short + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + additional physical form + + + + + + + + + + + 
+ + + + + content + + + + * + + + + + + + + + funding + + + + + + - + + + + + + + No. + + + + ; + + + + + + + + + + + + + + + + + + Keywords + + + + + + + + + + + + + + + host + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="//a1:component/a1:header/a1:publicationMeta/a1:titleGroup/a1:title[@type='main']"/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + + + + + + + abbreviated + + + <xsl:variable name="abttl"> + <xsl:apply-templates select="//a1:component/a1:header/a1:publicationMeta/a1:titleGroup/a1:title[@type='short']"/> + </xsl:variable> + <xsl:value-of select="normalize-space($abttl)"/> + + + + + + + + personal + + + + + + + + + + Correspondence: + + + + + + + + + + Correspondence: + + + + + + + + + + + E-mail: + + + + + + + + + + + + text + editor + + + + + + + text + editor + + + + + + + + + + + + + + + + + + + corporate + + + + + + + + + + + + + + content + + + + + + + + + + Supporting Info Item: + + + + + + - + + + + + + + + + + + Index Terms + + + + + + + + + + + + + + + + + + + + + + + + + + article category + + + + + + + + + + + + + + + + + ISSN + + + + + + eISSN + + + + + + + DOI + + + + + + CODEN + + + + + + PublisherID + + + + + + + + + + + + + + + + + + + + + + + title + + + <xsl:variable name="dttl"> + <xsl:apply-templates select="//a1:publicationMeta[@level='part']/a1:titleGroup/a1:title[@type='specialIssueTitle']"/> + </xsl:variable> + <xsl:value-of select="normalize-space($dttl)"/> + <xsl:if test="//a1:publicationMeta[@level='part']/a1:titleGroup/a1:title[@type='specialIssueSubtitle'][string-length() > 0]"> + <xsl:text> : </xsl:text> + <xsl:variable name="ttl"> + <xsl:apply-templates select="//a1:publicationMeta[@level='part']/a1:titleGroup/a1:title[@type='specialIssueSubtitle']"/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + </xsl:if> + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + + supplement + + Suppl. no. 
+ + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + + preceding + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="//a1:header/a1:publicationMeta[@level='product']/a1:selfCitationGroup/a1:citation[@type='ancestor']/a1:journalTitle"/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + ISSN + + + + + + + + + end + + + + + + volume + + last vol. + + + + + + + + + issue + + last no. + + + + + + + + + + + + + reviewOf + + + + + + <xsl:value-of select="normalize-space(//a1:component/a1:header/a1:contentMeta/a1:titleGroup/a1:title/a1:citation/a1:bookTitle)"/> + + + + + + + + + + + + + + + + + + + + text + + + + + + + + + + + + ISBN + + + + + + + + + + + pages + + + + + + + + + + + + + + + pages + + + + + + + + + + + + + + + + + isReferencedBy + + + + + + + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="a1:articleTitle "/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="a1:chapterTitle"/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="a1:journalTitle"/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + + + + <xsl:value-of select="normalize-space($ttl)"/> + + + + + + + + + + + + + + + + + + corporate + + + + + + + + + + journal article + + + + + book chapter + + + + + + + + + + + originalCategForm + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + journal + + + + + book + + + + + + + + + + + + + + text + + + + + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. 
+ + + + + + + + + pages + + + + + + + + + + + + + + + + + + host + + + + <xsl:variable name="norm"> + <xsl:apply-templates select="a1:journalTitle | a1:bookTitle"/> + </xsl:variable> + <xsl:value-of select="normalize-space($norm)"/> + + + + + + + + + + + + + + + + + + + + + + + + text + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + ISSN + + + + + + + + + end + + + + + + volume + + last vol. + + + + + + + + + issue + + last no. + + + + + + + + + + + + + + + + + + istex + + + + + + DOI + + + + + + ArticleID + + + + + + + + use and reproduction + + + + + + + + + + + + use and reproduction + + copyrightorrespondence address: + + + + + + + + , + + + + + + + + + + + , + + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + Correspondence address: + + + + + + + + + + + + + + , + + + + + + + + + + + , + + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Current Address: + + + + + + + + , + + + + + + + + + + + , + + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + Current Address: + + + + + + + + + + + + + + , + + + + + + + + + + + , + + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + , + + + + + + + + + + + , + + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Note: + + + + + + + + + + + + + + + + + + + + + + + + personal + + + termsOfAddress + + + + + + + + + given + + + + + + + + + + family + + + + + + + + + + family + + + + + + + + + + + + + 
+ + + + + + + + + + + + termsOfAddress + + + + + + + + + termsOfAddress + + + + + + + + + text + author + + + + + + + personal + + + termsOfAddress + + + + + + + + + given + + + + + + + + + + family + + + + + + + + + + family + + + + + + + + + + + + + + + + + + + + + + + + + termsOfAddress + + + + + + + + + termsOfAddress + + + + + + + + + text + editor + + + + + + + + + termsOfAddress + + + + + + + + + given + + + + + + + + + + + + + + + + family + + + + + + + + + + family + + + + + + + + + + + + + + + + + + + + + + + + + termsOfAddress + + + + + + + + + termsOfAddress + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/xslt/journalpublishing.xsl b/xslt/journalpublishing.xsl new file mode 100644 index 0000000..d93d493 --- /dev/null +++ b/xslt/journalpublishing.xsl @@ -0,0 +1,5213 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Version 0.1 générée le + + + + + 3.6 + + + + + + + + + + + + + + + + + + + + + + + + + + <xsl:variable name="ttl"> + <xsl:apply-templates select="//article/front/article-meta/title-group/article-title"/> + </xsl:variable> + <xsl:value-of select="normalize-space($ttl)"/> + <xsl:if test="//response"> + <xsl:if test="//article-categories/subj-group/subject[string-length() > 0]"> + <xsl:for-each select="//article-categories/subj-group"> + <xsl:if test="@subj-group-type='heading'"> + <xsl:text> [</xsl:text> + <xsl:value-of select="normalize-space(.)"/> + <xsl:text>]</xsl:text> + </xsl:if> + </xsl:for-each> + </xsl:if> + </xsl:if> + + + + + + + + + + + alternative + + + + + + + + + + + + + + + + CDATA + + + + <![CDATA[ + + ]]> + + + + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:choose> + <xsl:when test="//article/front/article-meta/title-group/article-title/fn"> + <xsl:value-of select="normalize-space(//article/front/article-meta/title-group/article-title/text())"/> + </xsl:when> + <xsl:otherwise> + <xsl:value-of 
select="normalize-space(//article/front/article-meta/title-group/article-title)"/> + </xsl:otherwise> + </xsl:choose> + <xsl:if test="//response"> + <xsl:if test="//article-categories/subj-group/subject[string-length() > 0]"> + <xsl:for-each select="//article-categories/subj-group"> + <xsl:if test="@subj-group-type='heading'"> + <xsl:text> [</xsl:text> + <xsl:value-of select="normalize-space(.)"/> + <xsl:text>]</xsl:text> + </xsl:if> + </xsl:for-each> + </xsl:if> + </xsl:if> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + + + + <![CDATA[ + + ]]> + + + + + + + translated + + + + + + + <xsl:value-of select="normalize-space(trans-title)"/> + + + + + + + + + + translated + + + + + + + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:value-of select="normalize-space(trans-title)"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + + + + <![CDATA[ + + ]]> + + + + + + + + + translated + + + + + + + <xsl:value-of select="normalize-space(title)"/> + + + + + + + + + + translated + + + + + + + <xsl:text disable-output-escaping="yes"><![CDATA[</xsl:text> + <xsl:value-of select="normalize-space(title)"/> + <xsl:text disable-output-escaping="yes">]]></xsl:text> + + + + <![CDATA[ + + ]]> + + + + + + + + + + + <xsl:value-of select="//article-meta/article-categories/subj-group[@subj-group-type='heading']/subject"/> + + + + + + + + + + + + + + + + corporate + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + personal + + + + termsOfAddress + + + + + + termsOfAddress + + + + + + + given + + + + + + given + + + + + family + + + + + + termsOfAddress + + + + + + termsOfAddress + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + E-mail: + + + + + 
+ E-mail: + + + + + + + + + E-mail: + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + personal + + + personal + + + corporate + + + + corresp + + + + + + + + + + + termsOfAddress + + + + + + termsOfAddress + + + + + + + given + + + + + + given + + + + + + family + + + + + + termsOfAddress + + + + + + termsOfAddress + + + + + + + + + + + + + + + + + + + , + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + personal + + + + termsOfAddress + + + + + + + given + + + + + family + + + + + termsOfAddress + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + E-mail: + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + text + + + + + + + + abstract + other + other + research-article + book reviews + other + brief communication + other + case report + other + other + other + other + editorial + other + other + review-article + other + other + other + other + other + other + other + brief communication + other + other + research-article + other + review-article + other + + other + + + + + + + + + + + + + + + + + + + + + + + + + + + + + heading + + + + + + + + + + + heading + + + + + + + + + + + + + + + + + JournalSubjectCodes + + + + + + + + + + + + + + + + + + + . 
+ + + + + + + + + + + + + + + + + text + + + + + + + + + w3cdtf + + + + + + + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + -12 + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + -12 + + - + + + + + + + - + + + + + + + + + w3cdtf + + + + + + + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + -12 + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + -12 + + - + + + + + - + + + + + + + + + + w3cdtf + + + + + + + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + -12 + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + -12 + - + + + + + - + + + + + + + + + w3cdtf + + + + + + + + + + + + code + iso639-2b + + + + + + + code + iso639-2b + + + + + + aar + abk + afr + aka + alb + amh + ara + arg + hye + arm + asm + ava + ave + aym + aze + bak + bam + eus + baq + bel + ben + bih + bis + bod + bos + bre + bul + bur + mya + cat + cha + che + chi + chu + chv + cor + cos + cre + cym + cze + dan + div + dut + nld + dzo + eng + epo + est + ewe + fao + fij + fin + fre + fry + ful + kat + ger + gla + gle + glg + glv + gre + grn + guj + hat + hau + heb + her + hin + hmo + hrv + hun + ibo + isl + ido + iii + iku + ile + ina + ind + ipk + ita + jav + jpn + kal + kan + kas + kau + kaz + khm + kik + kin + kir + kom + kon + kor + kua + kur + lao + lat + lav + lim + lin + lit + ltz + lub + lug + mac + mkd + mah + mal + mri + mao + mar + may + msa + mlg + mlt + mon + nau + nav + nbl + nde + ndo + nep + nno + nob + nor + nya + oci + oji + ori + orm + oss + pan + fas + per + pli + pol + por + pus + que + roh + rum + run + rus + sag + san + scc + sin + slo + slk + slv + sme + smo + sna + snd + som + sot + spa + sqi + srd + srp + ssw + sun + swa + swe + tah + tam + tat + tel + tgk + tgl + tha + tib + tir + ton + tsn + tso + tuk + tur + twi + uig + ukr + urd + uzb + ven + vie + vol + wel + wln + wol + xho + yid + yor + zha + zho + zul + aar + abk + afr + aka + alb + amh + ara + arg + hye + arm + 
asm + ava + ave + aym + aze + bak + bam + eus + baq + bel + ben + bih + bis + bod + bos + bre + bul + bur + mya + cat + cha + che + chi + chu + chv + cor + cos + cre + cym + cze + dan + div + dut + nld + dzo + eng + epo + est + ewe + fao + fij + fin + fre + fry + ful + kat + ger + gla + gle + glg + glv + gre + grn + guj + hat + hau + heb + her + hin + hmo + hrv + hun + ibo + isl + ido + iii + iku + ile + ina + ind + ipk + ita + jav + jpn + kal + kan + kas + kau + kaz + khm + kik + kin + kir + kom + kon + kor + kua + kur + lao + lat + lav + lim + lin + lit + ltz + lub + lug + mac + mkd + mah + mal + mri + mao + mar + may + msa + mlg + mlt + mon + nau + nav + nbl + nde + ndo + nep + nno + nob + nor + nya + oci + oji + ori + orm + oss + pan + fas + per + pli + pol + por + pus + que + roh + rum + run + rus + sag + san + scc + sin + slo + slk + slv + sme + smo + sna + snd + som + sot + spa + sqi + srd + srp + ssw + sun + swa + swe + tah + tam + tat + tel + tgk + tgl + tha + tib + tir + ton + tsn + tso + tuk + tur + twi + uig + ukr + urd + uzb + ven + vie + vol + wel + wln + wol + xho + yid + yor + zha + zho + zul + + + + + + + + + + + code + rfc3066 + + + + + + aa + ab + af + ak + sq + am + ar + an + hy + hy + as + av + ae + ay + az + ba + bm + eu + eu + be + bn + bh + bi + bo + bs + br + bg + my + my + ca + cs + ch + ce + zh + cu + cv + kw + co + cr + cy + cs + da + de + dv + nl + nl + dz + en + eo + et + ee + fo + fj + fi + fr + fr + fy + ff + ka + ka + de + gd + ga + gl + gv + el + el + gn + gu + ht + ha + he + hz + hi + ho + hr + hr + hu + ig + is + is + io + ii + iu + ie + ia + id + ik + it + jv + ja + kl + kn + ks + kr + kk + km + ki + rw + ky + kv + kg + ko + kj + ku + lo + la + lv + li + ln + lt + lb + lu + lg + mk + mk + mh + ml + mi + mi + mr + ms + ms + mg + mt + mn + na + nv + nr + nd + ng + ne + nn + nb + no + ny + oc + oj + or + om + os + pa + fa + fa + pi + pl + pt + ps + qu + rm + ro + ro + rn + ru + sg + sa + sr + si + sk + sk + sl + se + sm + sn + sd + 
so + st + es + sq + sc + sr + ss + su + sw + sv + ty + ta + tt + te + tg + tl + th + bo + ti + to + tn + ts + tk + tr + tw + ug + uk + ur + uz + ve + vi + vo + cy + wa + wo + xh + yi + yo + za + zh + zu + + + + + + + + code + rfc3066 + + + + + + + + + + text/html + + + + + + words + + + + + + + + + figures + + + + + + + + + tables + + + + + + + + + equations + + + + + + + + + references + + + + + + + edition + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + footnotes + + + + + + + + author-notes + + + + + + + + + + + + + + + foot-notes + + + + + + + + + + + + + + + content + Correction: + + + + + + + content + Correction + + Volume( + + ) + + + + Page( + + ) + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keywords + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Keywords + + + + + + + + + + + + + + + + + + + + host + + + + + + <xsl:value-of select="normalize-space(//journal-meta/journal-title)"/> + + + + + + + + + + + + <xsl:value-of select="normalize-space(//journal-meta/journal-title-group/journal-title)"/> + + + + + + + + + + + + + + <xsl:value-of select="normalize-space(//journal-meta/abbrev-journal-title[@abbrev-type='full'])"/> + + + + + <xsl:value-of select="normalize-space(//journal-meta/journal-title-group/abbrev-journal-title[@abbrev-type='full'])"/> + + + + + + <xsl:value-of select="normalize-space(//journal-id[@journal-id-type='publisher'])"/> + + + + + + + + + + + + + + + + + abbreviated + + <xsl:value-of select="normalize-space(//article/front/journal-meta/abbrev-journal-title)"/> + + + + + + abbreviated + + <xsl:value-of select="normalize-space(//journal-id[@journal-id-type='nlm-ta'])"/> + + + + <xsl:value-of select="normalize-space(//journal-id[@journal-id-type='pmc'])"/> + + + + + + + 
abbreviated + + <xsl:value-of select="normalize-space(//journal-meta/abbrev-journal-title[@abbrev-type='pubmed'])"/> + + + + <xsl:value-of select="normalize-space(//journal-meta/abbrev-journal-title[@abbrev-type='publisher'])"/> + + + + + + + + abbreviated + + <xsl:value-of select="normalize-space(//journal-meta/abbrev-journal-title[@abbrev-type='publisher'])"/> + + + + + + + + + + + . + + + + + + text + + + + + + + + + + ISSN + + + + + + eISSN + + + + + + ISBN-10 + + + + + + ISBN-13 + + + + + + JournalID + + + + + + JournalID-hwp + + + + + + JournalID-nlm-ta + + + + + + + + + + + + + + + + + + + + + + + + + + title + + <xsl:value-of select="normalize-space(//article-meta/issue-title)"/> + + + + + + volume + + vol. + + + + + + + + + + + + supplement + + + + + + issue + + no. + + + + + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + reviewOf + + + + + + + + related-articleID + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + pages + + + + + + + + + + + + + + + + reviewOf + + + + + + + <xsl:value-of select="normalize-space(//article/front/article-meta/product/source)"/> + + + + + + + personal + + + termsOfAddress + + + + + + + given + + + + + + family + + + + + termsOfAddress + + + + + + + + + + + + <xsl:value-of select="normalize-space(//article/front/article-meta/product/source)"/> + + + + + + + <xsl:value-of select="normalize-space(//article/front/article-meta/product)"/> + + + + + + + + personal + + + + termsOfAddress + + + + + + + given + + + + + + family + + + + + termsOfAddress + + + + + + + + + + + + + + + . 
+ + + + + + + text + + + + + + + + + ISBN + + + + + + + + + + + + + pages + + + + + + + + + + + + pages + + + + + + + + + + + + + + + references + + + + + <xsl:value-of select="normalize-space(front/article-meta/title-group/article-title)"/> + + + + + + + + + + + + + + + + + + + personal + + + + termsOfAddress + + + + + + termsOfAddress + + + + + + + + given + + + + + + + + given + + + + + + family + + + + + + termsOfAddress + + + + + + termsOfAddress + + + + + + + + + + + + + + + + + + + + + + + + + + + + , + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + E-mail: + + + + + + + + + E-mail: + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + . + + + + + + + text + + + + + + + + + w3cdtf + + + + + + + -01 + -02 + -03 + -04 + -05 + -06 + -07 + -08 + -09 + -10 + -11 + - + + + + + - + + + + + + + + + + + + + host + + + <xsl:value-of select="normalize-space(front/journal-meta/journal-title)"/> + + + + + + + + + + + abbreviated + + <xsl:value-of select="normalize-space(front/journal-meta/journal-id[@journal-id-type='nlm-ta'])"/> + + + + <xsl:value-of select="normalize-space(front/journal-meta/journal-id[@journal-id-type='pmc'])"/> + + + + + + + abbreviated + + <xsl:value-of select="normalize-space(front/journal-meta/journal-meta/abbrev-journal-title[@abbrev-type='pubmed'])"/> + + + + <xsl:value-of select="normalize-space(front/journal-meta/abbrev-journal-title[@abbrev-type='publisher'])"/> + + + + + + + + abbreviated + + <xsl:value-of select="normalize-space(front/journal-meta/abbrev-journal-title[@abbrev-type='publisher'])"/> + + + + + + + + + + + + + + + + + + + + JournalSubjectCodes + + + + + + + + + + + + + + + + ISSN + + + + + + eISSN + + + + + + DOI + + + + + + JournalID + + + + + + JournalID-hwp + + + + + + JournalID-nlm-ta + + + + + + + + + + + + volume + + + + + + + + issue + + + + + + + + pages + 
+ + + + + + + pages + + + + + + + + + + use and reproduction + copyright + + + + + + + + + isReferencedBy + + + + + + + + + + <xsl:value-of select="normalize-space(nlm-citation/article-title)"/> + + + + + + + <xsl:value-of select="normalize-space(mixed-citation)"/> + + + + + + + <xsl:value-of select="normalize-space(citation[@citation-type='other'])"/> + + + + + + + + <xsl:value-of select="normalize-space(nlm-citation/source)"/> + + + + + + + + + + personal + + + + termsOfAddress + + + + + + + + given + + + + + + family + + + + + + + + + + + + corporate + + + + + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + host + + + + + <xsl:value-of select="normalize-space(nlm-citation/source)"/> + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + isReferencedBy + + + + + + + + + + <xsl:value-of select="normalize-space(element-citation/article-title)"/> + + + + + + + <xsl:value-of select="normalize-space(mixed-citation)"/> + + + + + + + <xsl:value-of select="normalize-space(citation[@citation-type='other'])"/> + + + + + + + + <xsl:value-of select="normalize-space(element-citation/source)"/> + + + + + + + + + + personal + + + + termsOfAddress + + + + + + + + given + + + + + + family + + + + + + + + + + + corporate + + + + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + host + + + + + <xsl:value-of select="normalize-space(element-citation/source)"/> + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. 
+ + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + + isReferencedBy + + + + + + + + + + <xsl:value-of select="normalize-space(nlm-citation/article-title)"/> + + + + + + + <xsl:value-of select="normalize-space(nlm-mixed-citation)"/> + + + + + + + + + + <xsl:value-of select="normalize-space(nlm-citation/source)"/> + + + + + + + <xsl:value-of select="normalize-space(nlm-citation/text())"/> + + + + + + + + + + <xsl:value-of select="normalize-space(element-citation/article-title)"/> + + + + + + + <xsl:value-of select="normalize-space(element-mixed-citation)"/> + + + + + + + <xsl:value-of select="normalize-space(element-citation[@citation-type='other']/text())"/> + + + + + + + + <xsl:value-of select="normalize-space(element-citation/source)"/> + + + + + + + <xsl:value-of select="normalize-space(element-citation/text())"/> + + + + + + + + + + <xsl:value-of select="normalize-space(citation/article-title)"/> + + + + + + + <xsl:value-of select="normalize-space(mixed-citation)"/> + + + + + + + <xsl:value-of select="normalize-space(citation[@citation-type='other']/text())"/> + + + + + + + + <xsl:value-of select="normalize-space(citation/source)"/> + + + + + + + <xsl:value-of select="normalize-space(citation/text())"/> + + + + + + + + + + + + text + + + + + + + + + personal + + + + termsOfAddress + + + + + + + + given + + + + + + family + + + + + + + + + + text + + + + + + + + + + corporate + + + + + + + + + + + + + + text + + + + + + + + personal + + + + termsOfAddress + + + + + + + + given + + + + + + family + + + + + + + + + + text + + + + + + + + + corporate + + + + + + + + + + + + author + + + editor + + + + + personal + + + + termsOfAddress + + + + + + + + given + + + + + + family + + + + + + + + + text + + + + + + + + corporate + + + + + + + + + + + corporate + + + + + + + + + + corporate + + + + + + + + + + corporate + + + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + . 
+ + + + + + + text + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + host + + + + + <xsl:value-of select="normalize-space(nlm-citation/source)"/> + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + + pages + + + + + - + + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + host + + + + + <xsl:value-of select="normalize-space(element-citation/source)"/> + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + + pages + + + + + - + + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + host + + + + + <xsl:value-of select="normalize-space(citation/source)"/> + + + + + + + + + + + . + + + + + + + text + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + + pages + + + + + - + + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. + + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + + + volume + + vol. + + + + + + + + + issue + + no. 
+ + + + + + + + + pages + + + + + + + + + + + + + + + + + + + + + + + + istex + + + + + + + DOI + + + + + + DOI + + + + + + + URI + + + + + + filenameID + + + + + + SICI + + + + + + URI + + + + + + original-pdf + + + + + + href + + + + + + ArticleID + + + + + + PMID + + + + + + + Related-article-Href + + + + + + + related-article-ID + + + + + + + use and reproduction + + + + + + + + + use and reproduction + Copyright + + + + + + use and reproduction + Copyright + + + + + + + + + + + + OUP + + + + + + + + + + + + + + + corporate + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + E-mail: + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + personal + + + + termsOfAddress + + + + + + + + given + + + + + + given + + + + + + family + + + + + termsOfAddress + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + + + + E-mail: + + + + + + + + + E-mail: + + + + + + + + + E-mail: + + + + + + + + + + + + + + + + + + text + author + + + + + + + + text + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +