Commit 1b8fc4f0 authored by Fischer, Tim
Browse files

Rewrote the WZB Bundlefilter (now Econstor Bundlefilter) to harvest old RatSWD documents

parent a2bb2b26
......@@ -14,7 +14,7 @@ import org.gesis.dda.filter.impl.Jfr2SsoarBundleFilter;
import org.gesis.dda.filter.impl.SsoarTargetRepositoryBundleFilter;
import org.gesis.dda.filter.impl.Tatup2SsoarBundleFilter;
import org.gesis.dda.filter.impl.Wbv2SsoarBundleFilter;
import org.gesis.dda.filter.impl.Wzb2SsoarBundleFilter;
import org.gesis.dda.filter.impl.Econstor2SSOARBundleFilter;
import org.gesis.dda.persist.PersistableHelper;
import org.gesis.dda.publishing.domain.BundlesStreamSource;
import org.gesis.dda.publishing.domain.BundlesStreamSourceFactory;
......@@ -143,8 +143,8 @@ public class FeedingContextFactory {
result = new Jfr2SsoarBundleFilter();
break;
case "https://www.econstor.eu/oai/request$$ssoar":
log.info("using Wzb2SsoarBundleFilter");
result = new Wzb2SsoarBundleFilter();
log.info("using Econstor2SSOARBundleFilter");
result = new Econstor2SSOARBundleFilter();
break;
case "https://doabooks.org/oai/$$ssoar":
case "https://doabooks.org/oai$$ssoar":
......
package org.gesis.dda.filter.impl;
import java.util.Set;
import org.gesis.dda.filter.BundleFilter;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.Metadatum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Bundle filter deciding which Econstor documents are harvested.
 *
 * <p>The decision is based on the bundle's {@code dc.source.series} and
 * {@code dc.date.issued} metadata:
 * <ul>
 *   <li>"RatSWD Output Series": harvested only when the issue year is 2020 or later.</li>
 *   <li>Any other series starting with "RatSWD": always harvested, regardless of year.</li>
 *   <li>Everything else: harvested only when the issue year is 2016 or later.</li>
 * </ul>
 * Where a year check applies, a missing or unparsable {@code dc.date.issued}
 * means the bundle is not harvested.
 */
public class Econstor2SSOARBundleFilter implements BundleFilter {

    private final static Logger LOG = LoggerFactory.getLogger(Econstor2SSOARBundleFilter.class);

    /** Metadata key holding the series a document belongs to. */
    private static final String SERIES_KEY = "dc.source.series";

    /** Metadata key holding the year a document was issued. */
    private static final String YEAR_ISSUED_KEY = "dc.date.issued";

    /** Prefix shared by all RatSWD series names. */
    private static final String RATSWD_PREFIX = "RatSWD";

    /** The one RatSWD series that is restricted by issue year. */
    private static final String RATSWD_OUTPUT_SERIES = "RatSWD Output Series";

    /** First issue year (inclusive) harvested for regular Econstor documents. */
    private static final int ECONSTOR_MIN_YEAR = 2016;

    /** First issue year (inclusive) harvested for the RatSWD Output Series. */
    private static final int RATSWD_OUTPUT_MIN_YEAR = 2020;

    /**
     * There are some exceptions for harvesting documents from Econstor.
     * SSOAR already has the Econstor documents older than 2016,
     * therefore: harvest everything with yearIssuedValue >= 2016.
     *
     * For the RatSWD Output Series, SSOAR already has the documents older than 2020,
     * therefore: harvest everything with yearIssuedValue >= 2020.
     *
     * For the other two RatSWD series (Research Notes and Working Paper Series) every
     * document is wanted, and they primarily consist of documents older than 2016,
     * so no year restriction is applied to them.
     *
     * @param bundle the bundle whose metadata is inspected
     * @return {@code true} if the bundle should be harvested, {@code false} otherwise
     */
    @Override
    public boolean test(Bundle bundle) {
        Set<Metadatum> metadata = bundle.getMetadata();
        Metadatum series = findFirst(metadata, SERIES_KEY);
        Metadatum yearIssued = findFirst(metadata, YEAR_ISSUED_KEY);

        // A series entry without a value is treated like a missing series.
        String seriesValue = (null != series) ? series.getValue() : null;

        if (null != seriesValue && seriesValue.startsWith(RATSWD_PREFIX)) {
            if (seriesValue.equals(RATSWD_OUTPUT_SERIES)) {
                return isIssuedInOrAfter(yearIssued, RATSWD_OUTPUT_MIN_YEAR, bundle);
            }
            // Remaining RatSWD series are harvested completely.
            return true;
        }

        // Regular (non-RatSWD) Econstor document.
        return isIssuedInOrAfter(yearIssued, ECONSTOR_MIN_YEAR, bundle);
    }

    /** Returns the first metadatum with the given key, or {@code null} if there is none. */
    private static Metadatum findFirst(Set<Metadatum> metadata, String key) {
        // Constant-first comparison so a metadatum with a null key cannot cause an NPE.
        return metadata.stream().filter( m -> key.equals(m.getKey()) ).findFirst().orElse(null);
    }

    /**
     * Checks whether the issue year is present, parsable and not before {@code minYear}.
     * The bundle is only used for its reference in the warning about an unparsable year.
     */
    private static boolean isIssuedInOrAfter(Metadatum yearIssued, int minYear, Bundle bundle) {
        if (null == yearIssued) {
            return false;
        }
        String yearIssuedValueString = yearIssued.getValue();
        LOG.debug("year issued: {}", yearIssuedValueString);
        try {
            int yearIssuedValue = Integer.parseInt(yearIssuedValueString);
            if ( yearIssuedValue >= minYear ) {
                // Log the threshold that was actually applied (was hard-coded to 2016 before).
                LOG.info("Econstor2SSOARBundleFilter - '{}' is newer than {}", yearIssuedValueString, minYear);
                return true;
            }
        }
        catch (NumberFormatException e) {
            LOG.warn("Econstor2SSOARBundleFilter - unparsable dc.date.issued={} for bundle.reference={}", yearIssuedValueString, bundle.getReference() );
        }
        return false;
    }
}
package org.gesis.dda.filter.impl;
import java.util.Set;
import org.gesis.dda.filter.BundleFilter;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.Metadatum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Bundle filter that decides by the {@code dc.date.issued} metadatum whether
 * a bundle is harvested.
 */
public class Wzb2SsoarBundleFilter implements BundleFilter {

    private final static Logger LOG = LoggerFactory.getLogger(Wzb2SsoarBundleFilter.class);

    /**
     * SSOAR already has everything up to and including dc.date.issued=2015,
     * so only bundles with an issue year of 2016 or later pass the filter.
     * Bundles without dc.date.issued, or with a value that is not an integer,
     * pass as well.
     *
     * @param bundle the bundle whose metadata is inspected
     * @return {@code false} only when the issue year parses to a value below 2016
     */
    @Override
    public boolean test(Bundle bundle) {
        Set<Metadatum> allMetadata = bundle.getMetadata();

        // Pick the first metadatum carrying the issue date, if any.
        Metadatum issued = null;
        for (Metadatum candidate : allMetadata) {
            if (candidate.getKey().equals("dc.date.issued")) {
                issued = candidate;
                break;
            }
        }

        // Nothing to check against: let the bundle through.
        if (issued == null) {
            return true;
        }

        String rawYear = issued.getValue();
        LOG.debug("year issued: {}", rawYear);

        try {
            int year = Integer.parseInt(rawYear);
            if (year < 2016) {
                LOG.info("WzbSsoarBundleFilter - filtering away bundle.reference={} , wrong year issued", bundle.getReference() );
                return false;
            }
            LOG.info("'{}' is newer than 2016", rawYear);
            return true;
        }
        catch (NumberFormatException e) {
            // An unparsable year does not exclude the bundle.
            LOG.warn("unparsable dc.date.issued={} for bundle.reference={}", rawYear, bundle.getReference() );
            return true;
        }
    }
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment