Commit 97753d08 authored by Gerrit Hübbers's avatar Gerrit Hübbers 🃏
Browse files

Merge branch 'FEATURE-budrich-onix'

parents 46c5f132 e195d5c1
......@@ -70,8 +70,8 @@
<poi.version>3.14</poi.version>
<poi-ooxml.version>3.14</poi-ooxml.version>
<jsoup.version>1.9.2</jsoup.version>
<simplemagic.version>1.8</simplemagic.version>
<commons-validator.version>1.5.1</commons-validator.version>
<simplemagic.version>1.12</simplemagic.version>
<commons-validator.version>1.6</commons-validator.version>
<yoai-service-provider.version>4.5.5</yoai-service-provider.version>
<nv-i18n.version>1.21</nv-i18n.version>
<gson.version>2.8.0</gson.version>
......@@ -475,6 +475,21 @@
<artifactId>zoai</artifactId>
<version>${zoai.version}</version>
</dependency>
<dependency>
<groupId>xalan</groupId>
<artifactId>xalan</artifactId>
<version>2.7.2</version>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
<version>9.8.0-14</version>
</dependency>
<dependency>
<groupId>com.neovisionaries</groupId>
<artifactId>nv-i18n</artifactId>
<version>1.23</version>
</dependency>
<dependency>
<groupId>org.mockftpserver</groupId>
<artifactId>MockFtpServer</artifactId>
......
package org.gesis.dda.publishing.domain;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;
public interface BundlesStreamSource {
Stream<Bundle> getBundlesStream();
String getReference();
Set<Metadatum> getAllMetadata(String scopedIdentifier);
/**
 * Looks up the metadata of a single bundle of this source.
 * <p>
 * The bundle is searched by its full reference, which is built as
 * {@code getReference() + "@@" + scopedIdentifier}. The first bundle of
 * {@link #getBundlesStream()} carrying exactly that reference wins.
 *
 * @param scopedIdentifier identifier of the bundle within this source
 * @return the metadata of the matching bundle, or {@code null} if no bundle matches
 */
default Set<Metadatum> getAllMetadata(String scopedIdentifier) {
    final String wantedReference = getReference() + "@@" + scopedIdentifier;
    return getBundlesStream()
            .filter( candidate -> wantedReference.equals( candidate.getReference() ) )
            .findFirst()
            .map( Bundle::getMetadata )
            .orElse(null);
}
}
......@@ -18,6 +18,8 @@ import org.gesis.dda.publishing.domain.impl.UnapiBssConfigDto;
import org.gesis.dda.publishing.domain.impl.UnapiBundlesStreamSource;
import org.gesis.dda.publishing.domain.impl.XmlBundlesSetSource;
import org.gesis.dda.publishing.domain.impl.XmlFileBssConfigDto;
import org.gesis.dda.publishing.domain.impl.XmlXsltBundlesStreamSource;
import org.gesis.dda.publishing.domain.impl.XmlXsltDto;
import org.gesis.dda.publishing.domain.impl.XsltOaiPmhDto;
import org.gesis.dda.publishing.domain.impl.XsltTransformerOaiPmhBundlesStreamSource;
import org.gesis.dda.wizard.domain.BundlesSource;
......@@ -47,7 +49,7 @@ public class BundlesStreamSourceFactory {
}
public BundlesStreamSource getBundlesStreamSource(BundlesSource bundlesSourceEntity) {
return getBundlesStreamSource(bundlesSourceEntity.getType(), bundlesSourceEntity.getData() );
return getBundlesStreamSource(bundlesSourceEntity.getType(), bundlesSourceEntity.getData(), bundlesSourceEntity.getId() );
}
private String constructLoggingString(KmhopbssDto dtoDeserialized) {
......@@ -62,7 +64,7 @@ public class BundlesStreamSourceFactory {
return msg;
}
public BundlesStreamSource getBundlesStreamSource(BundlesSourceType bundlesSourceType, String data) {
public BundlesStreamSource getBundlesStreamSource(BundlesSourceType bundlesSourceType, String data, long bseId) {
BundlesStreamSource result = null;
if (BundlesSourceType.OAI_PMH == bundlesSourceType) {
KmhopbssDto dtoDeserialized = PersistableHelper.instantiate(data, KmhopbssDto.class);
......@@ -116,6 +118,12 @@ public class BundlesStreamSourceFactory {
Map<String, String> map = dto.getMetadataPrefix2XsltMap();
result = new XsltTransformerOaiPmhBundlesStreamSource(url, map, dto.getFrom(), dto.getUntil());
}
else if (BundlesSourceType.XML_XSLT_SOURCE == bundlesSourceType) {
XmlXsltDto dto = PersistableHelper.instantiate(data, XmlXsltDto.class);
String xmlContent = dto.getXmlFile().getContent();
String xsltContent = dto.getXsltFile().getContent();
result = new XmlXsltBundlesStreamSource(xmlContent, xsltContent, bseId);
}
return result;
}
......
......@@ -316,13 +316,16 @@ public class ContentResolver {
private static ContentsUrlsResolvingStrategy getStrategy(String seedUrlString) {
log.debug("seedUrlString is {}", seedUrlString);
ContentsUrlsResolvingStrategy result;
if ( seedUrlString.contains("wbv.de") ) {
result = new WbvResolvingStrategy();
if ( seedUrlString.contains("wbv.de") || seedUrlString.contains("budrich-academic.de") ) {
result = new UseNthPdfOnPageStrategy(0);
}
else if ( seedUrlString.contains("journals.sub.uni-hamburg.de") ) {
else if ( seedUrlString.contains("journals.sub.uni-hamburg.de") || seedUrlString.contains("degruyter.com") ) {
// unfortunately, SUB Uni Hamburg's OJS instances report an incorrect HTML citation_pdf_url
// meta element which points not to the actual PDF bitstream, but the landing page.
// Therefore, we discard all possible (X)HTML bitstream heuristics
// DeGruyter provides in all known cases single chapter PDFs, not full PDFs. Therefore we discard
// results from degruyter.com and rely on alternative PDF sources
result = new EmptySetResolvingStrategy();
}
else {
......
package org.gesis.dda.publishing.domain.impl;
/**
 * Simple value holder for a file: its name and its textual content.
 */
public class FileDto {

    /** Name of the file. */
    private String name;

    /** Content of the file as text. */
    private String content;

    /**
     * Creates a holder for the given file name and content. Neither value is
     * validated; both are stored as passed in.
     *
     * @param name    name of the file
     * @param content textual content of the file
     */
    public FileDto(final String name, final String content) {
        this.content = content;
        this.name = name;
    }

    /**
     * @return the file name given at construction time
     */
    public String getName() {
        return this.name;
    }

    /**
     * @return the file content given at construction time
     */
    public String getContent() {
        return this.content;
    }

}
\ No newline at end of file
......@@ -17,12 +17,26 @@ import org.slf4j.LoggerFactory;
import com.j256.simplemagic.ContentInfo;
import com.j256.simplemagic.ContentType;
public class WbvResolvingStrategy implements ContentsUrlsResolvingStrategy {
/**
 * Strategy for picking the correct PDF when an HTML page offers several of
 * them: only the nth PDF encountered on the page is collected. The index is
 * zero-based, i.e. {@code 0} selects the first PDF found.
 * @author huebbegt
 *
 */
public class UseNthPdfOnPageStrategy implements ContentsUrlsResolvingStrategy {
private final static Logger log = LoggerFactory.getLogger(UseNthPdfOnPageStrategy.class);
private int nthPdf;
private int currentPdf = 0;
private int currentAttempt = 0;
private int maximumRetries = 3;
private final static Logger log = LoggerFactory.getLogger(WbvResolvingStrategy.class);
public UseNthPdfOnPageStrategy(int nthPdf) {
this.nthPdf = nthPdf;
}
@Override
public Set<String> getContentsUrls(String seedUrl) {
......@@ -47,9 +61,16 @@ public class WbvResolvingStrategy implements ContentsUrlsResolvingStrategy {
if (null != ci) {
ContentType ct = ci.getContentType();
if ( ContentType.PDF.equals(ct) ) {
contentsUrlStrings.add(resolvedPossiblePdfUrlString);
wasSuccessful = true;
break; // WBV heuristic: first DOM resolvedPossiblePdfUrlString will be correct
if (nthPdf == currentPdf) {
contentsUrlStrings.add(resolvedPossiblePdfUrlString);
wasSuccessful = true;
break;
}
else {
currentPdf++;
continue;
}
}
}
}
......
package org.gesis.dda.transformer;
package org.gesis.dda.publishing.domain.impl;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Stream;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;
import org.gesis.dda.publishing.domain.impl.XmlBundle;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.BundlesStreamSource;
@XmlRootElement(name = "bundles")
public class XmlBundles {
/**
 * JAXB-bound container for a set of {@link XmlBundle}s that can be consumed as
 * a {@link BundlesStreamSource}.
 */
public class XmlBundles implements BundlesStreamSource {

    /** Reference of this source, bound to the {@code reference} XML element. */
    @XmlElement(name = "reference")
    private String reference;

    /**
     * Bundles bound to the repeated {@code bundle} XML element. Has no initial
     * value, so it stays {@code null} until something assigns it (presumably
     * also when an unmarshalled document contains no bundle at all).
     */
    @XmlElement(name="bundle")
    Set<XmlBundle> bundles;

    /**
     * @return the reference of this source; {@code null} if none was bound
     */
    @Override
    public String getReference() {
        return reference;
    }

    /**
     * @return the bound bundles as stored, without copying; may be {@code null}
     */
    public Set<XmlBundle> getBundles() {
        return bundles;
    }

    /**
     * Streams a snapshot copy of the bound bundles.
     *
     * @return a stream over the bundles, or an empty stream if no bundles were bound
     */
    @Override
    public Stream<Bundle> getBundlesStream() {
        // Without this guard, copying a never-populated set fails with a NullPointerException.
        if (null == bundles) {
            return Stream.empty();
        }
        // The copy both widens the element type to Bundle and decouples the stream from the field.
        Set<Bundle> view = new HashSet<>(bundles);
        return view.stream();
    }

}
package org.gesis.dda.publishing.domain.impl;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.UUID;
import java.util.stream.Stream;
import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.xalan.processor.TransformerFactoryImpl;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.BundlesStreamSource;
import org.gesis.dda.xslt.XsltUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link BundlesStreamSource} that applies an XSLT stylesheet to an XML payload
 * and unmarshals the transformation result into {@link XmlBundles}.
 */
public class XmlXsltBundlesStreamSource implements BundlesStreamSource {

    private final static Logger LOG = LoggerFactory.getLogger(XmlXsltBundlesStreamSource.class);

    /** Random by default; replaced by an id-based value in the three-argument constructor. */
    private String reference = "xmlxslt-uuid-" + UUID.randomUUID().toString();

    private final String xmlPayload;

    private final String xslt;

    /**
     * Creates a source with a randomly generated reference.
     *
     * @param xmlPayload XML document to transform
     * @param xslt       XSLT stylesheet that is applied to {@code xmlPayload}
     */
    public XmlXsltBundlesStreamSource(String xmlPayload, String xslt) {
        this.xmlPayload = xmlPayload;
        this.xslt = xslt;
    }

    /**
     * Creates a source whose reference is derived from the id of a bundles source.
     *
     * @param xmlPayload      XML document to transform
     * @param xslt            XSLT stylesheet that is applied to {@code xmlPayload}
     * @param bundlesSourceId id used to build the reference {@code xmlxslt-bundlessource-<id>}
     */
    public XmlXsltBundlesStreamSource(String xmlPayload, String xslt, long bundlesSourceId) {
        this(xmlPayload, xslt);
        this.reference = "xmlxslt-bundlessource-" + bundlesSourceId;
    }

    /**
     * Transforms the XML payload with the stylesheet and returns the bundles
     * contained in the result. The stylesheet receives two parameters:
     * {@code bundlesSourceReference} (this source's reference) and {@code util}
     * (an {@link XsltUtil} instance). The transformation is re-run on every call.
     *
     * @return stream of the bundles unmarshalled from the transformation result
     * @throws RuntimeException wrapping any failure of the transformation or the unmarshalling
     */
    @Override
    public Stream<Bundle> getBundlesStream() {
        try {
            // Xalan is requested by class name instead of relying on the default JAXP lookup.
            // NOTE(review): no secure-processing feature is set on the factory - confirm that
            // both the XML payload and the stylesheet come from trusted users only.
            TransformerFactory factory = TransformerFactory.newInstance("org.apache.xalan.processor.TransformerFactoryImpl", TransformerFactoryImpl.class.getClassLoader() );
            Templates xslTemplate = factory.newTemplates( new StreamSource( new StringReader(xslt) ) );

            Source xmlInput = new StreamSource( new StringReader(xmlPayload) );
            StringWriter writer = new StringWriter();
            Result xmlOutput = new StreamResult(writer);

            Transformer transformer = xslTemplate.newTransformer();
            transformer.setParameter("bundlesSourceReference", reference);
            transformer.setParameter("util", new XsltUtil() );
            transformer.transform(xmlInput, xmlOutput);

            JAXBContext jaxbContext = JAXBContext.newInstance(XmlBundles.class);
            Unmarshaller unmarshaller = jaxbContext.createUnmarshaller();

            // Unmarshal from characters rather than from re-encoded bytes: a byte stream is
            // decoded according to the encoding named in the XML declaration, which would
            // corrupt the text whenever the stylesheet declares an encoding other than UTF-8.
            XmlBundles xmlBundles = (XmlBundles) unmarshaller.unmarshal( new StringReader( writer.toString() ) );

            return xmlBundles.getBundlesStream();
        }
        catch (Throwable t) {
            // Throwable also covers Errors, e.g. a TransformerFactoryConfigurationError
            // raised when the requested factory class cannot be instantiated.
            LOG.error("Problem with XmlXsltBundlesStreamSource.reference=" + reference, t);
            throw new RuntimeException(t);
        }
    }

    /**
     * @return the reference of this source
     */
    @Override
    public String getReference() {
        return reference;
    }

}
package org.gesis.dda.publishing.domain.impl;
/**
 * Pairs an XML file with the XSLT file belonging to it.
 */
public class XmlXsltDto {

    /** The XML file of this pair. */
    private FileDto xmlFile;

    /** The XSLT file of this pair. */
    private FileDto xsltFile;

    /**
     * Creates the pair; both arguments are stored as passed in, without validation.
     *
     * @param xmlFile  the XML file
     * @param xsltFile the XSLT file
     */
    public XmlXsltDto(final FileDto xmlFile, final FileDto xsltFile) {
        this.xsltFile = xsltFile;
        this.xmlFile = xmlFile;
    }

    /**
     * @return the XML file given at construction time
     */
    public FileDto getXmlFile() {
        return this.xmlFile;
    }

    /**
     * @return the XSLT file given at construction time
     */
    public FileDto getXsltFile() {
        return this.xsltFile;
    }

}
\ No newline at end of file
......@@ -42,6 +42,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
import net.sf.saxon.TransformerFactoryImpl;
public class XsltTransformerOaiPmhBundlesStreamSource implements BundlesStreamSource {
......@@ -157,7 +158,7 @@ public class XsltTransformerOaiPmhBundlesStreamSource implements BundlesStreamSo
}
else {
// --- STEP 2: convert XML input according to XSLT
TransformerFactory factory = TransformerFactory.newInstance();
TransformerFactory factory = TransformerFactory.newInstance("net.sf.saxon.TransformerFactoryImpl", TransformerFactoryImpl.class.getClassLoader() );
Templates xslTemplate = factory.newTemplates(
new StreamSource(
new StringReader(
......
......@@ -4,5 +4,5 @@ package org.gesis.dda.wizard.domain.enumeration;
* The BundlesSourceType enumeration.
*/
public enum BundlesSourceType {
OAI_PMH, EXCEL_SPREADSHEET, XML_FILE, UNAPI, XSLT_OAI_PMH
OAI_PMH, EXCEL_SPREADSHEET, XML_FILE, UNAPI, XSLT_OAI_PMH, XML_XSLT_SOURCE
}
......@@ -81,6 +81,10 @@ public class BundlesSourceResource {
return ResponseEntity.badRequest().headers(HeaderUtil.createFailureAlert("bundlesSource", "idexists", "A new bundlesSource cannot already have an ID")).body(null);
}
BundlesSource result = bundlesSourceRepository.save(bundlesSource);
if ( result.getType().equals(BundlesSourceType.XML_XSLT_SOURCE) ) {
result.setReference("xmlxslt-bundlessource-" + result.getId() );
result = bundlesSourceRepository.save(result);
}
return ResponseEntity.created(new URI("/api/bundlesSources/" + result.getId()))
.headers(HeaderUtil.createEntityCreationAlert("bundlesSource", result.getId().toString()))
.body(result);
......
package org.gesis.dda.xslt;
import org.gesis.dda.feeder.ssoar.Licence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class XsltUtil {
private final static Logger LOG = LoggerFactory.getLogger(XsltUtil.class);
/**
 * Inserts hyphens into a 13-digit ISBN using the fixed grouping 3-1-4-4-1,
 * e.g. {@code 9783847409694} becomes {@code 978-3-8474-0969-4}.
 * <p>
 * Hyphens and whitespace already present in the input are removed first, so an
 * already hyphenated ISBN is re-grouped rather than mangled. Anything that does
 * not consist of exactly 13 digits afterwards is returned unchanged instead of
 * raising an exception.
 * <p>
 * NOTE(review): the grouping is hard-coded and is not derived from the ISBN
 * itself - confirm that all ISBNs fed into this method share the group lengths
 * of the example above.
 *
 * @param input the ISBN to hyphenate; may be {@code null}
 * @return the hyphenated ISBN, the unchanged input if it is not a 13-digit
 *         ISBN, or {@code null} if the input is {@code null}
 */
public static String hyphenizeIsbn(String input) {
    if (null == input) {
        return null;
    }

    // Tolerate values that are already hyphenated or padded with whitespace.
    String digits = input.replaceAll("[\\s-]", "");
    if ( !digits.matches("\\d{13}") ) {
        return input;
    }

    // 978-3-8474-0969-4
    return
            digits.substring(0, 3) + "-" +
            digits.substring(3, 4) + "-" +
            digits.substring(4, 8) + "-" +
            digits.substring(8, 12) + "-" +
            digits.substring(12);
}
public static String getAppropriateSsoarLicenceInternalId(String input) {
String result;
Licence licenceResult = null;
boolean isCreativeCommons = false;
boolean isBy = false;
boolean isSa = false;
boolean isNc = false;
boolean isNd = false;
boolean isPublicDomain = false;
String version = null;
String inputLowerCase = input.toLowerCase();
if ( inputLowerCase.contains("creativecommons") || inputLowerCase.contains("cc") ) {
isCreativeCommons = true;
}
if ( inputLowerCase.contains("by") ) {
isBy = true;
}
if ( inputLowerCase.contains("sa") ) {
isSa = true;
}
if ( inputLowerCase.contains("nc") ) {
isNc = true;
}
if ( inputLowerCase.contains("nd") ) {
isNd = true;
}
if ( inputLowerCase.contains("nd") ) {
isNd = true;
}
if ( inputLowerCase.contains("zero") || inputLowerCase.contains("CC0") ) {
isPublicDomain = true;
}
if ( inputLowerCase.contains("4.0") ) {
version = "4.0";
}
else if ( inputLowerCase.contains("3.0") ) {
version = "3.0";
}
else if ( inputLowerCase.contains("2.0") ) {
version = "2.0";
}
else if ( inputLowerCase.contains("1.0") ) {
version = "1.0";
}
if ( isCreativeCommons && !isPublicDomain && isBy && !isSa && !isNc && !isNd) {
if (null == version) {
licenceResult = Licence.CC_BY;
}
else if ( "1.0".equals(version) ) {
licenceResult = Licence.CC_BY_1_0;
}
else if ( "2.0".equals(version) ) {
licenceResult = Licence.CC_BY_2_0;
}
else if ( "3.0".equals(version) ) {
licenceResult = Licence.CC_BY_3_0;
}
else if ( "4.0".equals(version) ) {
licenceResult = Licence.CC_BY_4_0;
}
}
else if ( isCreativeCommons && !isPublicDomain && isBy && !isSa && isNc && isNd) {
if (null == version) {
licenceResult = Licence.CC_BY_NC_ND;
}
else if ( "1.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_ND_1_0;
}
else if ( "2.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_ND_2_0;
}
else if ( "3.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_ND_3_0;
}
else if ( "4.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_ND_4_0;
}
}
else if ( isCreativeCommons && !isPublicDomain && isBy && isSa && !isNc && !isNd) {
if (null == version) {
licenceResult = Licence.CC_BY_SA;
}
else if ( "1.0".equals(version) ) {
licenceResult = Licence.CC_BY_SA_1_0;
}
else if ( "2.0".equals(version) ) {
licenceResult = Licence.CC_BY_SA_2_0;
}
else if ( "3.0".equals(version) ) {
licenceResult = Licence.CC_BY_SA_3_0;
}
else if ( "4.0".equals(version) ) {
licenceResult = Licence.CC_BY_SA_4_0;
}
}
else if ( isCreativeCommons && !isPublicDomain && isBy && !isSa && !isNc && isNd) {
if (null == version) {
licenceResult = Licence.CC_BY_ND;
}
else if ( "1.0".equals(version) ) {
licenceResult = Licence.CC_BY_ND_1_0;
}
else if ( "2.0".equals(version) ) {
licenceResult = Licence.CC_BY_ND_2_0;
}
else if ( "3.0".equals(version) ) {
licenceResult = Licence.CC_BY_ND_3_0;
}
else if ( "4.0".equals(version) ) {
licenceResult = Licence.CC_BY_ND_4_0;
}
}
else if ( isCreativeCommons && !isPublicDomain && isBy && !isSa && isNc && !isNd) {
if (null == version) {
licenceResult = Licence.CC_BY_NC;
}
else if ( "1.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_1_0;
}
else if ( "2.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_3_0;
}
else if ( "3.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_3_0;
}
else if ( "4.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_4_0;
}
}
else if ( isCreativeCommons && !isPublicDomain && isBy && isSa && isNc && !isNd) {
licenceResult = Licence.CC_BY_NC_SA;
if (null == version) {
licenceResult = Licence.CC_BY_NC_SA;
}
else if ( "1.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_SA_1_0;
}
else if ( "2.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_SA_2_0;
}
else if ( "3.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_SA_3_0;
}
else if ( "4.0".equals(version) ) {
licenceResult = Licence.CC_BY_NC_SA_4_0;
}
}
else if ( isCreativeCommons && isPublicDomain ) {
licenceResult = Licence.CC_0;
}
if (null != licenceResult) {
result = licenceResult.getInternalIdentifierLicenceMetadatum().getValue();
}
else {
LOG.warn("Could not identify licence for input={}", input);
result = null;
}
return result;