Commit 46c5f132 authored by Gerrit Hübbers's avatar Gerrit Hübbers 🃏
Browse files

Merge branch 'FEATURE-fqs-harvesting'

parents 35b57780 e5df540d
......@@ -7,6 +7,7 @@ import org.gesis.dda.feeder.FeedingContext;
import org.gesis.dda.feeder.NextIncrementalHarvestingIntervalStrategy;
import org.gesis.dda.filter.BundleFilter;
import org.gesis.dda.filter.impl.AcceptAnyBundleFilter;
import org.gesis.dda.filter.impl.Fqs2SsoarBundleFilter;
import org.gesis.dda.filter.impl.SsoarTargetRepositoryBundleFilter;
import org.gesis.dda.filter.impl.Wbv2SsoarBundleFilter;
import org.gesis.dda.persist.PersistableHelper;
......@@ -116,19 +117,23 @@ public class FeedingContextFactory {
String sourceId = source.getReference();
String targetId = target.getReference();
String key = sourceId + "$$" + targetId;
log.debug("Looking up MetadataTransformer for key {}", key);
switch (key) {
case "http://dspace.wbv.de:8888/oai/request$$ssoar":
log.debug("using http://dspace.wbv.de:8888/oai/request$$ssoar");
result = new Wbv2SsoarBundleFilter();
break;
case "http://www.qualitative-research.net/index.php/fqs/oai/$$ssoar":
log.info("using Fqs2SsoarBundleFilter");
result = new Fqs2SsoarBundleFilter();
break;
default:
log.debug("using default IdentityMetadataTransformer");
result = new AcceptAnyBundleFilter();
break;
}
return result;
}
......
package org.gesis.dda.filter.impl;
import java.util.Set;
import org.gesis.dda.filter.BundleFilter;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.Metadatum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link BundleFilter} for bundles harvested from FQS that are destined for SSOAR.
 *
 * <p>A bundle is accepted unless it carries a parsable {@code dc.source.volume}
 * metadatum whose value is below {@link #FIRST_VOLUME_TO_HARVEST}.
 */
public class Fqs2SsoarBundleFilter implements BundleFilter {

    private static final Logger LOG = LoggerFactory.getLogger(Fqs2SsoarBundleFilter.class);

    /** Key of the metadatum that holds the journal volume. */
    private static final String VOLUME_KEY = "dc.source.volume";

    /**
     * SSOAR already has everything up to and including dc.source.volume=17 dc.source.issue=3,
     * and volume 17 only has three issues; therefore everything starting with this volume
     * is harvested.
     */
    private static final int FIRST_VOLUME_TO_HARVEST = 18;

    /**
     * Decides whether the given bundle should be passed on.
     *
     * @param bundle the harvested bundle to inspect
     * @return {@code false} only if the first {@code dc.source.volume} metadatum found parses
     *         to a number below {@link #FIRST_VOLUME_TO_HARVEST}; {@code true} if the volume
     *         is missing, unparsable, or at least {@link #FIRST_VOLUME_TO_HARVEST}
     */
    @Override
    public boolean test(Bundle bundle) {
        Set<Metadatum> metadata = bundle.getMetadata();
        // Constant-first equals: a metadatum with a null key must not abort the whole test.
        Metadatum volume = metadata.stream()
                .filter(m -> VOLUME_KEY.equals(m.getKey()))
                .findFirst()
                .orElse(null);
        if (null == volume) {
            // No volume information at all: let the bundle through.
            return true;
        }

        String volumeValueString = volume.getValue();
        try {
            // Surrounding whitespace must not turn an old volume into an "unparsable" (accepted) one.
            // A null value is handed to parseInt unchanged, which reports it as NumberFormatException.
            String trimmed = (null == volumeValueString) ? null : volumeValueString.trim();
            int volumeValue = Integer.parseInt(trimmed);
            if (volumeValue >= FIRST_VOLUME_TO_HARVEST) {
                return true;
            }
            LOG.info("filtering away bundle.reference={}", bundle.getReference() );
            return false;
        }
        catch (NumberFormatException e) {
            // Deliberately lenient: an unreadable volume is logged and the bundle is kept.
            LOG.warn("unparsable dc.source.volume={} for bundle.reference={}", volumeValueString, bundle.getReference() );
            return true;
        }
    }
}
......@@ -71,7 +71,7 @@ public class Wbv2SsoarMetadataTransformer implements MetadataTransformer {
else if ( originalMetadatumKey.matches("dc.identifier.doi") ) {
String cleanedMetadatumValue = originalMetadatumValue;
if ( ! cleanedMetadatumValue.startsWith("http") ) {
cleanedMetadatumValue = "https://dx.doi.org/" + cleanedMetadatumValue;
cleanedMetadatumValue = "https://doi.org/" + cleanedMetadatumValue;
}
Metadatum newMetadatum = new SimpleMetadatum("dc.identifier.doi", cleanedMetadatumValue);
result.add(newMetadatum);
......
package org.gesis.dda.wizard.domain;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import java.io.Serializable;
import java.time.Instant;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import javax.persistence.CollectionTable;
import javax.persistence.Column;
import javax.persistence.ElementCollection;
import javax.persistence.Entity;
import javax.persistence.EnumType;
import javax.persistence.Enumerated;
import javax.persistence.FetchType;
import javax.persistence.GeneratedValue;
import javax.persistence.GenerationType;
import javax.persistence.Id;
import javax.persistence.JoinColumn;
import javax.persistence.ManyToOne;
import javax.persistence.Table;
import org.gesis.dda.publishing.domain.Metadatum;
import org.gesis.dda.publishing.domain.impl.JpaMetadatum;
import org.gesis.dda.wizard.domain.enumeration.BundleState;
......@@ -9,12 +27,8 @@ import org.gesis.dda.wizard.service.BundleService;
import org.hibernate.annotations.Cache;
import org.hibernate.annotations.CacheConcurrencyStrategy;
import javax.persistence.*;
import java.io.Serializable;
import java.time.Instant;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
/**
* A Bundle.
......@@ -45,7 +59,7 @@ public class Bundle implements Serializable, org.gesis.dda.publishing.domain.Bun
@Column(name = "last_harvesting_instant")
private Instant lastHarvestingInstant;
@ElementCollection(fetch = FetchType.EAGER)
@ElementCollection(fetch = FetchType.LAZY)
@CollectionTable(name="metadatum",
joinColumns=@JoinColumn(name="bundle_id")
)
......@@ -59,6 +73,7 @@ public class Bundle implements Serializable, org.gesis.dda.publishing.domain.Bun
this.id = id;
}
@Override
public String getReference() {
return reference;
}
......
......@@ -72,7 +72,7 @@ public class FeederService {
}
@Async
// @Transactional
@Transactional
public void feed(FeedingContext feedingContext) {
log.info("starting feeding");
String sourceReference = feedingContext.getSourceReference();
......
......@@ -3,15 +3,19 @@
xsi:noNamespaceSchemaLocation="http://ehcache.org/ehcache.xsd"
name="CM1"
updateCheck="false"
maxBytesLocalHeap="16M">
maxBytesLocalHeap="64M">
<!--
This is a default configuration, it is re-configured by the CacheConfiguration Spring Bean, using the
properties from the resources/config/*.yml files.
-->
<diskStore path="java.io.tmpdir"/>
<sizeOfPolicy maxDepth="100000"/>
<defaultCache
eternal="false"
overflowToDisk="false"
......
......@@ -125,9 +125,9 @@
</xsl:template>
<xsl:template match="nlm:article-id[@pub-id-type='doi']">
<xsl:variable name="doi" select="concat( 'http://dx.doi.org/', text() )" />
<xsl:variable name="doi" select="concat( 'https://doi.org/', text() )" />
<xsl:call-template name="new-metadatum">
<xsl:with-param name="key" select="'dc.identifier.urn'"/>
<xsl:with-param name="key" select="'dc.identifier.doi'"/>
<xsl:with-param name="language" select="''"/>
<xsl:with-param name="value" select="$doi"/>
</xsl:call-template>
......
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="3.0"
xmlns:cc="http://www.d-nb.de/standards/cc/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:ddb="http://www.d-nb.de/standards/ddb/"
xmlns:dini="http://www.d-nb.de/standards/xmetadissplus/type/"
xmlns:doi="http://www.d-nb.de/standards/doi/"
xmlns:hdl="http://www.d-nb.de/standards/hdl/"
xmlns:mml="http://www.w3.org/1998/Math/MathML"
xmlns:nlm="http://dtd.nlm.nih.gov/publishing/2.3"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:pc="http://www.d-nb.de/standards/pc/"
xmlns:thesis="http://www.ndltd.org/standards/metadata/etdms/1.0/"
xmlns:urn="http://www.d-nb.de/standards/urn/"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:xMetaDiss="http://www.d-nb.de/standards/xmetadissplus/"
xmlns:xoai="http://www.lyncode.com/xoai"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:zoai="http://git.gesis.org/dda/zoai"
>
<xsl:output indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- <xsl:message terminate="no">here is the tree at the current node <xsl:copy-of select="."/> </xsl:message> -->
<!-- <xsl:message terminate="no">here is some message for stderr, e.g. <xsl:value-of select='text()' /> </xsl:message> -->
<!--
<xsl:for-each select="./*">
<xsl:message terminate="no">a child=<xsl:value-of select="name()"/></xsl:message>
</xsl:for-each>
-->
<!-- Replace the built-in element rule with an empty one: any element that has no
     dedicated template below produces no output and its children are not visited. -->
<xsl:template match="*"/>
<!-- Entry point: wraps everything in <bundle><metadata> and emits, in this order,
     the metadata derived from the NLM article, the metadata derived from the OAI
     header, and a set of constant metadata. dc.type.stock and dc.type.document are
     not constants here; they are produced by the oai:setSpec template. -->
<xsl:template match="/">
  <xsl:variable name="record" select="oai:OAI-PMH/oai:GetRecord/oai:record"/>
  <xsl:element name="bundle">
    <xsl:element name="metadata">
      <xsl:apply-templates select="$record/oai:metadata/nlm:article"/>
      <xsl:apply-templates select="$record/oai:header"/>

      <!-- Journal code; an earlier revision used the literal title
           'Forum Qualitative Sozialforschung / Forum: Qualitative Social Research' here. -->
      <xsl:call-template name="new-metadatum">
        <xsl:with-param name="key" select="'dc.source.journal'"/>
        <xsl:with-param name="value" select="'132'"/>
        <xsl:with-param name="language" select="''"/>
      </xsl:call-template>

      <xsl:call-template name="new-metadatum">
        <xsl:with-param name="key" select="'ssoar.contributor.institution'"/>
        <xsl:with-param name="value" select="'FQS'"/>
        <xsl:with-param name="language" select="''"/>
      </xsl:call-template>

      <xsl:call-template name="new-metadatum">
        <xsl:with-param name="key" select="'dc.description.pubstatus'"/>
        <xsl:with-param name="value" select="'1'"/>
        <xsl:with-param name="language" select="''"/>
      </xsl:call-template>

      <xsl:call-template name="new-metadatum">
        <xsl:with-param name="key" select="'dc.publisher.country'"/>
        <xsl:with-param name="value" select="'DEU'"/>
        <xsl:with-param name="language" select="''"/>
      </xsl:call-template>

      <!-- See http://www.qualitative-research.net/index.php/fqs/about/editorialPolicies#peerReviewProcess -->
      <xsl:call-template name="new-metadatum">
        <xsl:with-param name="key" select="'dc.description.review'"/>
        <xsl:with-param name="value" select="'1'"/>
        <xsl:with-param name="language" select="''"/>
      </xsl:call-template>
    </xsl:element>
  </xsl:element>
</xsl:template>
<!-- Article root: emits dc.language from the article's xml:lang attribute (passed
     through zoai:sanitizeLanguage) and then processes the article's children. -->
<xsl:template match="oai:OAI-PMH/oai:GetRecord/oai:record/oai:metadata/nlm:article">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.language'"/>
    <xsl:with-param name="language" select="''"/>
    <xsl:with-param name="value" select="zoai:sanitizeLanguage( @xml:lang )"/>
  </xsl:call-template>
  <xsl:apply-templates/>
</xsl:template>
<!-- OAI header: no output of its own, just descend into the children. -->
<xsl:template match="oai:OAI-PMH/oai:GetRecord/oai:record/oai:header">
  <xsl:apply-templates/>
</xsl:template>
<!-- Article front matter: no output of its own, just descend into the children. -->
<xsl:template match="oai:OAI-PMH/oai:GetRecord/oai:record/oai:metadata/nlm:article/nlm:front">
  <xsl:apply-templates/>
</xsl:template>
<!-- Journal metadata container: descend into the children. -->
<xsl:template match="nlm:journal-meta">
  <xsl:apply-templates/>
</xsl:template>
<!-- Electronic ISSN (pub-type 'epub') becomes dc.identifier.issn. -->
<xsl:template match="nlm:issn[@pub-type='epub']">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.identifier.issn'"/>
    <xsl:with-param name="value" select="text()"/>
    <xsl:with-param name="language" select="''"/>
  </xsl:call-template>
</xsl:template>
<!-- Article metadata container: descend into the children. -->
<xsl:template match="nlm:article-meta">
  <xsl:apply-templates/>
</xsl:template>
<!-- Contributor group: descend so that each author contrib is handled. -->
<xsl:template match="nlm:contrib-group">
  <xsl:apply-templates/>
</xsl:template>
<!-- Author contributor: emits dc.contributor.author as "Surname, Given names".
     The name parts are joined only when present, so an author with a single name
     part does not get a dangling ", ", and a name part consisting of several text
     nodes no longer makes concat() fail. Nothing is emitted when both parts are empty. -->
<xsl:template match="nlm:contrib[@contrib-type='author']">
  <xsl:variable name="surname" select="string-join( nlm:name/nlm:surname/text(), '' )"/>
  <xsl:variable name="givenNames" select="string-join( nlm:name/nlm:given-names/text(), '' )"/>
  <!-- The parenthesised sequence keeps the order surname, given names. -->
  <xsl:variable name="fullName" select="string-join( ($surname, $givenNames)[. != ''], ', ' )"/>
  <xsl:if test="$fullName != ''">
    <xsl:call-template name="new-metadatum">
      <xsl:with-param name="key" select="'dc.contributor.author'"/>
      <xsl:with-param name="language" select="''"/>
      <xsl:with-param name="value" select="$fullName"/>
    </xsl:call-template>
  </xsl:if>
</xsl:template>
<!-- DOI article id: prefixed with the https://doi.org/ resolver and emitted as dc.identifier.doi. -->
<xsl:template match="nlm:article-id[@pub-id-type='doi']">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.identifier.doi'"/>
    <xsl:with-param name="language" select="''"/>
    <xsl:with-param name="value" select="concat( 'https://doi.org/', text() )"/>
  </xsl:call-template>
</xsl:template>
<!-- Only the 'collection' publication date is descended into; other pub-date
     elements fall through to the empty default template. -->
<xsl:template match="nlm:pub-date[@pub-type='collection']">
  <xsl:apply-templates/>
</xsl:template>
<!-- Publication year becomes dc.date.issued. -->
<xsl:template match="nlm:year">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.date.issued'"/>
    <xsl:with-param name="value" select="text()"/>
    <xsl:with-param name="language" select="''"/>
  </xsl:call-template>
</xsl:template>
<!-- Translated abstract: emits dc.description.abstract tagged with the abstract's
     language, and additionally dc.identifier.urn when
     zoai:extractUrnFromFqsAbstract finds a URN in the abstract text. -->
<xsl:template match="nlm:abstract-trans">
  <xsl:variable name="abstractText" select="nlm:p/text()"/>
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.description.abstract'"/>
    <xsl:with-param name="language" select="zoai:sanitizeLanguage(@xml:lang)"/>
    <xsl:with-param name="value" select="$abstractText"/>
  </xsl:call-template>
  <xsl:variable name="maybeUrn" select="zoai:extractUrnFromFqsAbstract( $abstractText )"/>
  <xsl:if test="$maybeUrn != ''">
    <xsl:call-template name="new-metadatum">
      <xsl:with-param name="key" select="'dc.identifier.urn'"/>
      <xsl:with-param name="language" select="''"/>
      <xsl:with-param name="value" select="$maybeUrn"/>
    </xsl:call-template>
  </xsl:if>
</xsl:template>
<!-- A self-uri without a content-type attribute supplies dc.identifier.url
     from its xlink:href. -->
<xsl:template match="nlm:self-uri[ not(@content-type) ]">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.identifier.url'"/>
    <xsl:with-param name="value" select="@xlink:href"/>
    <xsl:with-param name="language" select="''"/>
  </xsl:call-template>
</xsl:template>
<!-- Permissions container: descend so that the licence is handled. -->
<xsl:template match="nlm:permissions">
  <xsl:apply-templates/>
</xsl:template>
<!-- Licence: three known Creative Commons 4.0 URLs are translated into the codes
     16 (by), 20 (by-nc-nd) and 32 (by-nc); any other xlink:href is passed through
     unchanged. The result is emitted as dc.rights.licence. -->
<xsl:template match="nlm:license">
  <xsl:variable name="licence"
                select="if (@xlink:href = 'http://creativecommons.org/licenses/by/4.0') then '16'
                        else if (@xlink:href = 'http://creativecommons.org/licenses/by-nc-nd/4.0') then '20'
                        else if (@xlink:href = 'http://creativecommons.org/licenses/by-nc/4.0') then '32'
                        else @xlink:href"/>
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.rights.licence'"/>
    <xsl:with-param name="language" select="''"/>
    <xsl:with-param name="value" select="$licence"/>
  </xsl:call-template>
</xsl:template>
<!-- Issue number becomes dc.source.issue. -->
<xsl:template match="nlm:issue">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.source.issue'"/>
    <xsl:with-param name="value" select="text()"/>
    <xsl:with-param name="language" select="''"/>
  </xsl:call-template>
</xsl:template>
<!-- Issue title becomes dc.source.issuetopic. -->
<xsl:template match="nlm:issue-title">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.source.issuetopic'"/>
    <xsl:with-param name="value" select="text()"/>
    <xsl:with-param name="language" select="''"/>
  </xsl:call-template>
</xsl:template>
<!-- Volume number becomes dc.source.volume. -->
<xsl:template match="nlm:volume">
  <xsl:call-template name="new-metadatum">
    <xsl:with-param name="key" select="'dc.source.volume'"/>
    <xsl:with-param name="value" select="text()"/>
    <xsl:with-param name="language" select="''"/>
  </xsl:call-template>
</xsl:template>
<!-- Keyword group: descend so that each keyword is handled. -->
<xsl:template match="nlm:kwd-group">
  <xsl:apply-templates/>
</xsl:template>
<!-- Keyword: commas in the keyword text are replaced by semicolons and the result,
     if non-empty, is emitted as dc.subject.other in the language of the enclosing
     element's xml:lang attribute. -->
<xsl:template match="nlm:kwd">
  <xsl:variable name="subjects" select="replace( text(), ',', ';')"/>
  <xsl:if test="$subjects != ''">
    <xsl:call-template name="new-metadatum">
      <xsl:with-param name="key" select="'dc.subject.other'"/>
      <xsl:with-param name="language" select="zoai:sanitizeLanguage(../@xml:lang)"/>
      <xsl:with-param name="value" select="$subjects"/>
    </xsl:call-template>
  </xsl:if>
</xsl:template>
<!-- Set membership determines dc.type.stock and dc.type.document:
       fqs:REV, fqs:U22, fqs:U23, fqs:U24  ->  recension / 23
       fqs:CONF                            ->  article   / 16
       any other fqs: set                  ->  article   / 32
     See http://www.qualitative-research.net/index.php/fqs/oai?verb=ListSets and
     http://www.qualitative-research.net/index.php/fqs/about/editorialPolicies#peerReviewProcess -->
<xsl:template match="/oai:OAI-PMH/oai:GetRecord/oai:record/oai:header/oai:setSpec">
  <!-- Guard against multiple set assignments such as 'fqsregion:ART' AND
       'ec_fundedresources': only sets starting with 'fqs:' are considered. -->
  <xsl:if test="starts-with( text(), 'fqs:' )">
    <xsl:variable name="isReview" select="text() = ('fqs:REV', 'fqs:U22', 'fqs:U23', 'fqs:U24')"/>
    <xsl:variable name="stock" select="if ($isReview) then 'recension' else 'article'"/>
    <xsl:variable name="documentType"
                  select="if ($isReview) then '23'
                          else if (text() = 'fqs:CONF') then '16'
                          else '32'"/>
    <xsl:call-template name="new-metadatum">
      <xsl:with-param name="key" select="'dc.type.stock'"/>
      <xsl:with-param name="language" select="''"/>
      <xsl:with-param name="value" select="$stock"/>
    </xsl:call-template>
    <xsl:call-template name="new-metadatum">
      <xsl:with-param name="key" select="'dc.type.document'"/>
      <xsl:with-param name="language" select="''"/>
      <xsl:with-param name="value" select="$documentType"/>
    </xsl:call-template>
  </xsl:if>
</xsl:template>
<!-- Builds one <metadatum> element with a <key>, an optional <language> and a <value>.
     key:      metadatum key
     language: language tag; when it is the empty string no <language> element is
               written, otherwise it is passed through zoai:sanitizeLanguage
     value:    metadatum value -->
<xsl:template name="new-metadatum">
  <xsl:param name="key"/>
  <xsl:param name="language"/>
  <xsl:param name="value"/>
  <xsl:element name="metadatum">
    <xsl:element name="key">
      <xsl:value-of select="$key"/>
    </xsl:element>
    <xsl:if test="$language != ''">
      <xsl:element name="language">
        <xsl:value-of select="zoai:sanitizeLanguage($language)"/>
      </xsl:element>
    </xsl:if>
    <xsl:element name="value">
      <xsl:value-of select="$value"/>
    </xsl:element>
  </xsl:element>
</xsl:template>
<!-- Maps a handful of known German, English and Spanish language tags onto the
     two-letter codes 'de', 'en' and 'es'. Any other input is returned unchanged. -->
<xsl:function name="zoai:sanitizeLanguage">
  <xsl:param name="inputLanguage"/>
  <xsl:value-of select="if ($inputLanguage = ('ger', 'DE', 'de-DE', 'deu')) then 'de'
                        else if ($inputLanguage = ('eng', 'EN', 'en-US', 'en-GB')) then 'en'
                        else if ($inputLanguage = ('spa', 'ES', 'es-ES', 'esp')) then 'es'
                        else $inputLanguage"/>
</xsl:function>
<!-- Extracts a 'urn:nbn:de:...' identifier that closes an FQS abstract.
     rawString: the abstract text; may be a sequence of several text nodes or strings
     Returns the URN when it is the last whitespace-delimited token of the abstract,
     otherwise nothing.

     The input is first joined into one string and whitespace-normalised because
     (a) xsl:analyze-string accepts a single string only, while callers may pass several
         text nodes (e.g. an abstract with more than one paragraph), and
     (b) without the 'm' flag '$' only matches at the very end of the string, so a
         trailing line break or blank would otherwise prevent the match or end up
         inside the captured URN. -->
<xsl:function name="zoai:extractUrnFromFqsAbstract">
  <xsl:param name="rawString"/>
  <xsl:variable name="flattened" select="normalize-space( string-join( $rawString, ' ' ) )"/>
  <!-- The greedy '.*' makes the group start at the last 'urn:nbn:de:' in the text. -->
  <xsl:analyze-string select="$flattened" regex=".*(urn:nbn:de:\S+)$">
    <xsl:matching-substring>
      <xsl:value-of select="regex-group(1)"/>
    </xsl:matching-substring>
  </xsl:analyze-string>
</xsl:function>
</xsl:stylesheet>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="3.0"
xmlns:cc="http://www.d-nb.de/standards/cc/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dcmitype="http://purl.org/dc/dcmitype/"
xmlns:dcterms="http://purl.org/dc/terms/"
xmlns:ddb="http://www.d-nb.de/standards/ddb/"
xmlns:dini="http://www.d-nb.de/standards/xmetadissplus/type/"
xmlns:doi="http://www.d-nb.de/standards/doi/"
xmlns:hdl="http://www.d-nb.de/standards/hdl/"
xmlns:mml="http://www.w3.org/1998/Math/MathML"
xmlns:nlm="http://dtd.nlm.nih.gov/publishing/2.3"
xmlns:oai="http://www.openarchives.org/OAI/2.0/"
xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
xmlns:pc="http://www.d-nb.de/standards/pc/"
xmlns:thesis="http://www.ndltd.org/standards/metadata/etdms/1.0/"
xmlns:urn="http://www.d-nb.de/standards/urn/"
xmlns:xlink="http://www.w3.org/1999/xlink"
xmlns:xMetaDiss="http://www.d-nb.de/standards/xmetadissplus/"