Commit d793c76b authored by Gerrit Hübbers's avatar Gerrit Hübbers 🃏
Browse files

Improve GIGA metadata transformation

parent 5ac200c3
package org.gesis.dda.publishing.domain.impl;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
......@@ -86,7 +87,7 @@ public class XsltTransformerOaiPmhBundlesStreamSourceTest {
}
@Test
public void getSinglePublication() throws IOException {
public void getSinglePublicationAfsp() throws IOException {
Map<String, String> map = new HashMap<>();
......@@ -117,6 +118,8 @@ public class XsltTransformerOaiPmhBundlesStreamSourceTest {
assertTrue( metadata.contains( new SimpleMetadatum("dc.title.alternative", "en", "Review: Manfred Schulz (ed.): Entwicklungsträger in der DR Kongo. Entwicklungen in Politik, Wirtschaft, Religion, Zivilgesellschaft und Kultur (2007)") ) );
assertTrue( metadata.contains( new SimpleMetadatum("dc.source.issuetopic", "New Nationalism and Xenophobia in Africa") ) );
assertTrue( metadata.contains( new SimpleMetadatum("dc.source.pageinfo", "125-127") ) );
assertTrue( metadata.contains( new SimpleMetadatum("internal.identifier.ddc", "303") ) );
assertTrue( metadata.contains( new SimpleMetadatum("internal.identifier.ddc", "324") ) );
bundle = bss.getBundle("oai:hup.sub.uni-hamburg.de.giga:article/122");
......@@ -193,6 +196,13 @@ public class XsltTransformerOaiPmhBundlesStreamSourceTest {
assertTrue( metadata.contains( new SimpleMetadatum("dc.contributor.author", "Jong, Piter de") ) ); // yes, double space :(
assertTrue( metadata.contains( new SimpleMetadatum("dc.contributor.author", "Greeven, Mark J.") ) ); // yes, double space :(
assertTrue( metadata.contains( new SimpleMetadatum("dc.contributor.author", "Ebbers, Haico") ) );
bundle = bss.getBundle("oai:hup.sub.uni-hamburg.de.giga:article/996");
metadata = bundle.getMetadata();
metadata.stream().map(Object::toString).forEach(LOG::info);
assertTrue( metadata.contains( new SimpleMetadatum("dc.date.issued", "2016") ) );
assertFalse( metadata.contains( new SimpleMetadatum("dc.date.issued", "2017") ) );
}
@Test
......@@ -259,5 +269,28 @@ public class XsltTransformerOaiPmhBundlesStreamSourceTest {
assertTrue( metadata.contains( new SimpleMetadatum("dc.description.review", "2") ) );
assertTrue( metadata.contains( new SimpleMetadatum("dc.source.issuetopic", "Subnational Authoritarianism and Democratization in Latin America") ) ); // yes, double space :(
assertTrue( metadata.contains( new SimpleMetadatum("dc.source.pageinfo", "113-153") ) );
bundle = bss.getBundle("oai:hup.sub.uni-hamburg.de.giga:article/1002");
metadata = bundle.getMetadata();
metadata.stream().map(Object::toString).forEach(LOG::info);
assertTrue( metadata.contains( new SimpleMetadatum("dc.date.issued", "2016") ) );
assertFalse( metadata.contains( new SimpleMetadatum("dc.date.issued", "2017") ) );
}
/**
 * Verifies the "not a list of DDC numbers" regular expression:
 * purely numeric subject strings must be rejected by it, while
 * textual subject strings must be accepted.
 */
@Test
public void matchDdcString() {
    final String notDdcNumbersRegex = "(\\s?\\D+;?)+";

    // Strings that start with digits can never satisfy the leading \D+ requirement.
    final String[] numericSubjects = { "303; 324; 311", "303" };
    for (String candidate : numericSubjects) {
        assertFalse( candidate.matches(notDdcNumbersRegex) );
    }

    // Digit-free strings, with or without a "; " separator, match completely.
    final String[] textualSubjects = { "Gesellschaft; Entwicklung", "Gesellschaft" };
    for (String candidate : textualSubjects) {
        assertTrue( candidate.matches(notDdcNumbersRegex) );
    }
}
}
......@@ -61,7 +61,6 @@
<!-- dc:source may contain page information ... if it exists and follows the pattern 123(-125), it is after the last '; ' occurrence -->
<xsl:variable name="pageinfo" select="tokenize(text(),'; ')[last()]" />
<!-- <xsl:message terminate="no">pageinfo=<xsl:value-of select="$pageinfo"/></xsl:message> -->
<xsl:if test="$pageinfo != '' and matches($pageinfo, '^\d+(-\d+)?')">
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.source.pageinfo'"/>
......@@ -70,6 +69,28 @@
</xsl:call-template>
</xsl:if>
<xsl:analyze-string select="text()" regex=".*\((\d+)\).*">
<xsl:matching-substring>
<!-- <xsl:message terminate="no">issued=<xsl:value-of select="regex-group(1)"/></xsl:message> -->
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.date.issued'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="regex-group(1)"/>
</xsl:call-template>
</xsl:matching-substring>
</xsl:analyze-string>
</xsl:template>
<xsl:template match="dc:subject">
<!-- Emits one 'dc.subject.other' metadatum with an empty language.
     Its value is the text of this element with every ',' replaced by ';'.
     Note kept from the original author: querying for metadataPrefix=oai_dc provides language keys. -->
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.subject.other'"/>
<xsl:with-param name="value" select="replace( text(), ',', ';')"/>
<xsl:with-param name="language" select="''"/>
</xsl:call-template>
</xsl:template>
<xsl:template name="new-metadatum">
......
......@@ -126,7 +126,7 @@
<xsl:template match="dc:subject[@xsi:type='dcterms:DDC']">
<!-- e.g., here text() evaluates to '300; 303; 320; 324' (values are separated by '; ') -->
<xsl:for-each select="tokenize(text(), ', ')" >
<xsl:for-each select="tokenize(text(), '; ')" >
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'internal.identifier.ddc'"/>
<xsl:with-param name="language" select="''"/>
......@@ -135,17 +135,6 @@
</xsl:for-each>
</xsl:template>
<xsl:template match="dc:subject[@xsi:type='xMetaDiss:noScheme']">
<!-- Emits one 'dc.subject.other' metadatum with an empty language.
     Its value is the text of this element with every ',' replaced by ';'.
     Note kept from the original author: querying for metadataPrefix=oai_dc provides language keys. -->
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.subject.other'"/>
<xsl:with-param name="value" select="replace( text(), ',', ';')"/>
<xsl:with-param name="language" select="''"/>
</xsl:call-template>
</xsl:template>
<xsl:template match="dcterms:abstract">
<xsl:variable name="language" select="@lang" />
<xsl:call-template name="new-metadatum">
......@@ -155,14 +144,6 @@
</xsl:call-template>
</xsl:template>
<xsl:template match="dcterms:issued">
<!-- Emits one 'dc.date.issued' metadatum with an empty language.
     Only the first four characters of the element text are kept
     (presumably the year of a longer date value; confirm against the feed). -->
<xsl:variable name="issuedPrefix" select="substring( text(), 1, 4)" />
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.date.issued'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="$issuedPrefix"/>
</xsl:call-template>
</xsl:template>
<xsl:template match="dc:identifier[@xsi:type='urn:nbn']">
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.identifier.urn'"/>
......
......@@ -61,7 +61,7 @@
<!-- dc:source may contain page information ... if it exists and follows the pattern 123(-125), it is after the last '; ' occurrence -->
<xsl:variable name="pageinfo" select="tokenize(text(),'; ')[last()]" />
<!-- <xsl:message terminate="no">pageinfo=<xsl:value-of select="$pageinfo"/></xsl:message> -->
<xsl:if test="$pageinfo != '' and matches($pageinfo, '^\d+(-\d+)?')">
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.source.pageinfo'"/>
......@@ -70,6 +70,16 @@
</xsl:call-template>
</xsl:if>
<xsl:analyze-string select="text()" regex=".*\((\d+)\).*">
<xsl:matching-substring>
<!-- <xsl:message terminate="no">issued=<xsl:value-of select="regex-group(1)"/></xsl:message> -->
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.date.issued'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="regex-group(1)"/>
</xsl:call-template>
</xsl:matching-substring>
</xsl:analyze-string>
</xsl:template>
<xsl:template name="new-metadatum">
......
......@@ -147,14 +147,6 @@
</xsl:call-template>
</xsl:template>
<xsl:template match="dcterms:issued">
<!-- Emits one 'dc.date.issued' metadatum with an empty language.
     Only the first four characters of the element text are kept
     (presumably the year of a longer date value; confirm against the feed). -->
<xsl:variable name="issuedPrefix" select="substring( text(), 1, 4)" />
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.date.issued'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="$issuedPrefix"/>
</xsl:call-template>
</xsl:template>
<xsl:template match="dc:identifier[@xsi:type='urn:nbn']">
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.identifier.urn'"/>
......
......@@ -70,6 +70,17 @@
</xsl:call-template>
</xsl:if>
<xsl:analyze-string select="text()" regex=".*\((\d+)\).*">
<xsl:matching-substring>
<!-- <xsl:message terminate="no">issued=<xsl:value-of select="regex-group(1)"/></xsl:message> -->
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.date.issued'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="regex-group(1)"/>
</xsl:call-template>
</xsl:matching-substring>
</xsl:analyze-string>
</xsl:template>
<xsl:template name="new-metadatum">
......
......@@ -147,14 +147,6 @@
</xsl:call-template>
</xsl:template>
<xsl:template match="dcterms:issued">
<!-- Emits one 'dc.date.issued' metadatum with an empty language.
     Only the first four characters of the element text are kept
     (presumably the year of a longer date value; confirm against the feed). -->
<xsl:variable name="issuedPrefix" select="substring( text(), 1, 4)" />
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.date.issued'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="$issuedPrefix"/>
</xsl:call-template>
</xsl:template>
<xsl:template match="dc:identifier[@xsi:type='urn:nbn']">
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.identifier.urn'"/>
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment