Commit 027026e5 authored by Gerrit Hübbers's avatar Gerrit Hübbers 🃏
Browse files

Merge branch 'FEATURE-bibtex-citavi-adapter'

parents c75de39d 029a9fef
# Line-ending/text conversion is disabled by default ("text=false");
# the per-extension rules below declare which files are text or binary.
* text=false
# Explicitly declare text files you want to always be normalized and converted
# to native line endings on checkout.
*.java text
*.js text
*.css text
*.html text
# Denote all files that are truly binary and should not be modified.
*.png binary
*.jpg binary
*.gif binary
*.jar binary
*.pdf binary
*.eot binary
*.ttf binary
*.woff binary
*.woff2 binary
# NOTE(review): ".oft" looks like a typo for the OpenType font extension ".otf"; declaring both to be safe
*.oft binary
*.otf binary
*.gzip binary
*.gz binary
*.ai binary
*.eps binary
*.swf binary
*.xls binary
*.xlsx binary
*.webm binary
......@@ -490,6 +490,11 @@
<artifactId>nv-i18n</artifactId>
<version>1.23</version>
</dependency>
<dependency>
<groupId>org.jbibtex</groupId>
<artifactId>jbibtex</artifactId>
<version>1.0.17</version>
</dependency>
<dependency>
<groupId>org.mockftpserver</groupId>
<artifactId>MockFtpServer</artifactId>
......
......@@ -2,8 +2,11 @@ package org.gesis.dda.feeder.ssoar;
import org.gesis.dda.publishing.domain.Metadatum;
import org.gesis.dda.publishing.domain.impl.SimpleMetadatum;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public enum Licence {
CC_BY("1", "Creative Commons - Namensnennung", "Creative Commons - Attribution"),
CC_BY_NC_ND("2", "Creative Commons - Namensnennung, Nicht kommerz., Keine Bearbeitung", "Creative Commons - Attribution, Noncommercial, No Derivate Works"),
DEPOSIT("3", "Deposit Licence - Keine Weiterverbreitung, keine Bearbeitung", "Deposit Licence - No Redistribution, No Modifications"),
......@@ -41,6 +44,10 @@ public enum Licence {
CC_BY_NC_SA_3_0("35", "Creative Commons - Namensnennung, Nicht-kommerz., Weitergabe unter gleichen Bedingungen 3.0", "Creative Commons - Attribution-NonCommercial-ShareAlike 3.0"),
CC_BY_NC_SA_4_0("36", "Creative Commons - Namensnennung, Nicht-kommerz., Weitergabe unter gleichen Bedingungen 4.0", "Creative Commons - Attribution-NonCommercial-ShareAlike 4.0");
private final static Logger LOG = LoggerFactory.getLogger(Licence.class);
private final Metadatum deMetadatum;
private final Metadatum enMetadatum;
private final Metadatum internalMetadatum;
......@@ -68,4 +75,182 @@ public enum Licence {
/** Returns the internal legacy-format Metadatum for this licence constant. */
public Metadatum getLegacyMetadatum() {
return legacyInternalMetadatum;
}
/**
 * Maps a free-form licence description (e.g. the value of a BibTeX "licence"
 * field) to the legacy Metadatum of the matching Licence constant.
 *
 * Recognizes Creative Commons variants (BY / SA / NC / ND in versions
 * 1.0-4.0), CC0 / public domain, and the deposit licence. Matching is
 * case-insensitive and substring-based.
 *
 * @param input free-form licence text; may be null
 * @return the legacy Metadatum of the identified licence, or null when the
 *         input is null or could not be mapped (a warning is logged)
 */
public static Metadatum getLegacyLicenceMetadatum(String input) {
    // Null guard added for consistency with the sibling
    // getLegacy*Metadatum helpers in PublicationStatus and ReviewStatus.
    if (null == input) {
        LOG.warn("Could not identify licence for input={}", input);
        return null;
    }
    String inputLowerCase = input.toLowerCase();

    boolean isCreativeCommons = inputLowerCase.contains("creativecommons") || inputLowerCase.contains("cc");
    boolean isBy = inputLowerCase.contains("by");
    boolean isSa = inputLowerCase.contains("sa");
    boolean isNc = inputLowerCase.contains("nc");
    boolean isNd = inputLowerCase.contains("nd");
    // Bug fix: the original tested inputLowerCase.contains("CC0"), which can
    // never match a lower-cased string; compare against "cc0" instead.
    boolean isPublicDomain = inputLowerCase.contains("zero") || inputLowerCase.contains("cc0");

    // Highest version first so "4.0" is not shadowed by a lower match.
    String version = null;
    if ( inputLowerCase.contains("4.0") ) {
        version = "4.0";
    }
    else if ( inputLowerCase.contains("3.0") ) {
        version = "3.0";
    }
    else if ( inputLowerCase.contains("2.0") ) {
        version = "2.0";
    }
    else if ( inputLowerCase.contains("1.0") ) {
        version = "1.0";
    }

    Licence licenceResult = null;
    // Public-domain check first; this is equivalent to the original chain,
    // where every CC-family branch also required !isPublicDomain.
    if ( isCreativeCommons && isPublicDomain ) {
        licenceResult = Licence.CC_0;
    }
    else if ( isCreativeCommons && isBy && !isSa && !isNc && !isNd ) {
        licenceResult = selectByVersion(version, Licence.CC_BY, Licence.CC_BY_1_0, Licence.CC_BY_2_0, Licence.CC_BY_3_0, Licence.CC_BY_4_0);
    }
    else if ( isCreativeCommons && isBy && !isSa && isNc && isNd ) {
        licenceResult = selectByVersion(version, Licence.CC_BY_NC_ND, Licence.CC_BY_NC_ND_1_0, Licence.CC_BY_NC_ND_2_0, Licence.CC_BY_NC_ND_3_0, Licence.CC_BY_NC_ND_4_0);
    }
    else if ( isCreativeCommons && isBy && isSa && !isNc && !isNd ) {
        licenceResult = selectByVersion(version, Licence.CC_BY_SA, Licence.CC_BY_SA_1_0, Licence.CC_BY_SA_2_0, Licence.CC_BY_SA_3_0, Licence.CC_BY_SA_4_0);
    }
    else if ( isCreativeCommons && isBy && !isSa && !isNc && isNd ) {
        licenceResult = selectByVersion(version, Licence.CC_BY_ND, Licence.CC_BY_ND_1_0, Licence.CC_BY_ND_2_0, Licence.CC_BY_ND_3_0, Licence.CC_BY_ND_4_0);
    }
    else if ( isCreativeCommons && isBy && !isSa && isNc && !isNd ) {
        // Bug fix: the original mapped version "2.0" to CC_BY_NC_3_0 (an
        // apparent copy-paste error). NOTE(review): assumes the enum declares
        // CC_BY_NC_2_0 like every other CC family — confirm against the
        // constant list above.
        licenceResult = selectByVersion(version, Licence.CC_BY_NC, Licence.CC_BY_NC_1_0, Licence.CC_BY_NC_2_0, Licence.CC_BY_NC_3_0, Licence.CC_BY_NC_4_0);
    }
    else if ( isCreativeCommons && isBy && isSa && isNc && !isNd ) {
        licenceResult = selectByVersion(version, Licence.CC_BY_NC_SA, Licence.CC_BY_NC_SA_1_0, Licence.CC_BY_NC_SA_2_0, Licence.CC_BY_NC_SA_3_0, Licence.CC_BY_NC_SA_4_0);
    }
    else if ( inputLowerCase.contains("deposit") ) {
        licenceResult = Licence.DEPOSIT;
    }

    if (null == licenceResult) {
        LOG.warn("Could not identify licence for input={}", input);
        return null;
    }
    return licenceResult.getLegacyMetadatum();
}

/**
 * Picks the Licence constant matching the detected version string, or the
 * unversioned fallback when no version was detected.
 */
private static Licence selectByVersion(String version, Licence unversioned, Licence v10, Licence v20, Licence v30, Licence v40) {
    if (null == version) {
        return unversioned;
    }
    switch (version) {
        case "1.0" : return v10;
        case "2.0" : return v20;
        case "3.0" : return v30;
        case "4.0" : return v40;
        // Unreachable: version is always one of the four literals above.
        default : return unversioned;
    }
}
}
......@@ -37,4 +37,34 @@ public enum PublicationStatus {
/** Returns the internal legacy-format Metadatum for this publication-status constant. */
public Metadatum getLegacyMetadatum() {
return legacyInternalMetadatum;
}
/**
 * Resolves a publication-status label (German or English wording) or its
 * numeric code ("1".."5") to the legacy Metadatum of the matching
 * PublicationStatus constant.
 *
 * @param input status label or numeric code; may be null
 * @return the matching legacy Metadatum, or null when the input is null or
 *         not recognized
 */
public static Metadatum getLegacyPublicationStatusMetadatum(String input) {
    if (null == input) {
        return null;
    }
    if (input.contains("Veröffentlichungsversion") || input.contains("Published Version") || input.equals("1")) {
        return PublicationStatus.PUBLISHED_VERSION.getLegacyMetadatum();
    }
    if (input.contains("Postprint") || input.equals("2")) {
        return PublicationStatus.POSTPRINT.getLegacyMetadatum();
    }
    if (input.contains("Preprint") || input.equals("3")) {
        return PublicationStatus.PREPRINT.getLegacyMetadatum();
    }
    if (input.contains("unbekannt") || input.contains("unknown") || input.equals("4")) {
        return PublicationStatus.UNKNOWN.getLegacyMetadatum();
    }
    if (input.contains("Erstveröffentlichung") || input.contains("Primary Publication") || input.equals("5")) {
        return PublicationStatus.PRIMARY_PUBLICATION.getLegacyMetadatum();
    }
    return null;
}
}
......@@ -36,4 +36,31 @@ public enum ReviewStatus {
/** Returns the internal legacy-format Metadatum for this review-status constant. */
public Metadatum getLegacyMetadatum() {
return legacyInternalMetadatum;
}
/**
 * Resolves a review-status label (German or English wording) or its numeric
 * code ("1".."4") to the legacy Metadatum of the matching ReviewStatus
 * constant.
 *
 * @param input status label or numeric code; may be null
 * @return the matching legacy Metadatum, or null when the input is null or
 *         not recognized
 */
public static Metadatum getLegacyReviewStatusMetadatum(String input) {
    if (null == input) {
        return null;
    }
    if (input.contains("peer") || input.equals("1")) {
        return ReviewStatus.PEER_REVIEWED.getLegacyMetadatum();
    }
    if (input.contains("begutachtet") || input.contains("reviewed") || input.equals("2")) {
        return ReviewStatus.REVIEWED.getLegacyMetadatum();
    }
    if (input.contains("nicht") || input.contains("not") || input.equals("3")) {
        return ReviewStatus.NOT_REVIEWED.getLegacyMetadatum();
    }
    if (input.contains("unbekannt") || input.contains("unknown") || input.equals("4")) {
        return ReviewStatus.UNKNOWN.getLegacyMetadatum();
    }
    return null;
}
}
......@@ -11,6 +11,8 @@ import java.util.Map;
import javax.inject.Inject;
import org.gesis.dda.persist.PersistableHelper;
import org.gesis.dda.publishing.domain.impl.BibtexBundlesStreamSource;
import org.gesis.dda.publishing.domain.impl.BibtexDto;
import org.gesis.dda.publishing.domain.impl.ExcelSpreadsheetBundlesSetSource;
import org.gesis.dda.publishing.domain.impl.KmhopbssDto;
import org.gesis.dda.publishing.domain.impl.KnownMetadataPrefixesOaiPmhBundlesStreamSource;
......@@ -124,6 +126,12 @@ public class BundlesStreamSourceFactory {
String xsltContent = dto.getXsltFile().getContent();
result = new XmlXsltBundlesStreamSource(xmlContent, xsltContent, bseId);
}
else if (BundlesSourceType.BIBTEX_SOURCE == bundlesSourceType) {
BibtexDto dto = PersistableHelper.instantiate(data, BibtexDto.class);
String bibtexContent = dto.getBibtexFile().getContent();
result = new BibtexBundlesStreamSource(bibtexContent, bseId);
}
return result;
}
......
package org.gesis.dda.publishing.domain.impl;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.gesis.dda.feeder.ssoar.Licence;
import org.gesis.dda.feeder.ssoar.PublicationStatus;
import org.gesis.dda.feeder.ssoar.ReviewStatus;
import org.gesis.dda.feeder.ssoar.Stock;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.BundlesStreamSource;
import org.gesis.dda.publishing.domain.Metadatum;
import org.jbibtex.BibTeXDatabase;
import org.jbibtex.BibTeXEntry;
import org.jbibtex.BibTeXParser;
import org.jbibtex.Key;
import org.jbibtex.LaTeXObject;
import org.jbibtex.LaTeXParser;
import org.jbibtex.LaTeXPrinter;
import org.jbibtex.ParseException;
import org.jbibtex.TokenMgrException;
import org.jbibtex.Value;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class BibtexBundlesStreamSource implements BundlesStreamSource {
private final static Key INSTITUTION_KEY = new Key("institution");
private final static Key BOOK_INSTITUTION_KEY = new Key("bookinstitution");
private final static Key REVIEWSTATUS_KEY = new Key("reviewstatus");
private final static Key BOOK_REVIEWSTATUS_KEY = new Key("bookreviewstatus");
private final static Key LICENCE_KEY = new Key("licence");
private final static Key BOOK_LICENCE_KEY = new Key("booklicence");
private final static Key PUBLICATIONSTATUS_KEY = new Key("publicationstatus");
private final static Key BOOK_PUBLICATIONSTATUS_KEY = new Key("bookpublicationstatus");
private final static Key LANGUAGE_KEY = new Key("language");
private final static Key BOOK_LANGUAGE_KEY = new Key("booklanguage");
private final static Key EMBARGODATE_KEY = new Key("embargodate");
private final static Key BOOK_EMBARGODATE_KEY = new Key("bookembargodate");
private final static Key KEYWORDS_KEY = new Key("keywords");
private final static Key BOOK_KEYWORDS_KEY = new Key("bookkeywords");
private final static Logger LOG = LoggerFactory.getLogger(BibtexBundlesStreamSource.class);
private LaTeXParser latexParser;
private LaTeXPrinter latexPrinter = new LaTeXPrinter();
private String reference = "bibtex-uuid" + UUID.randomUUID().toString();
private String bibtexPayload;
/**
 * Creates a source that reads bundles from the given BibTeX document.
 *
 * @param bibtexPayload the complete BibTeX file content to parse
 * @throws RuntimeException if the LaTeX parser cannot be initialized
 */
public BibtexBundlesStreamSource(String bibtexPayload) {
    this.bibtexPayload = bibtexPayload;
    try {
        this.latexParser = new LaTeXParser();
    }
    catch (ParseException cause) {
        // Without a working LaTeX parser no field value can be decoded,
        // so fail construction outright.
        LOG.error("Problem initializing LaTeXParser", cause);
        throw new RuntimeException(cause);
    }
}
/**
 * Creates a source whose reference is derived from the persistent
 * bundles-source id instead of the default random UUID.
 *
 * @param bibtexPayload the complete BibTeX file content to parse
 * @param bundlesSourceId persistent id used to build a stable reference
 */
public BibtexBundlesStreamSource(String bibtexPayload, long bundlesSourceId) {
this(bibtexPayload);
this.reference = "bibtex-bundlessource-" + bundlesSourceId;
}
/**
 * Decodes a raw BibTeX field value (which may contain LaTeX markup) into a
 * plain string.
 *
 * @param input the raw field value
 * @return the LaTeX-decoded plain-text representation
 * @throws RuntimeException if the value cannot be parsed as LaTeX
 */
private String convertLatexValueToNormalString(Value input) {
    String rawFieldValue = input.toUserString();
    // Newlines break the LaTeX parser, so drop CRs and flatten LFs to spaces.
    String sanitized = rawFieldValue.replace("\r", "").replace("\n", " ");
    List<LaTeXObject> parsedObjects;
    try {
        parsedObjects = latexParser.parse(sanitized);
    }
    catch (TokenMgrException | ParseException e1) {
        LOG.error("Problem parsing userString=" + sanitized, e1);
        throw new RuntimeException(e1);
    }
    return latexPrinter.print(parsedObjects);
}
@Override
// Parses the BibTeX payload and turns every entry into a Bundle of
// Dublin-Core-style Metadatum objects. Wraps any parse/IO problem in a
// RuntimeException.
public Stream<Bundle> getBundlesStream() {
Stream<Bundle> result;
try {
BibTeXParser bibtexParser = new BibTeXParser();
BibTeXDatabase database;
// parseFully reads the entire payload in one pass; the reader is
// closed by try-with-resources.
try ( Reader reader = new StringReader(bibtexPayload) ) {
database = bibtexParser.parseFully(reader);
}
Map<Key, BibTeXEntry> entries = database.getEntries();
List<Bundle> bundles = new ArrayList<>();
entries.forEach( (k, e) -> {
Set<Metadatum> metadata = new HashSet<>();
// Per-entry reference: source-level reference + "@@" + BibTeX citation key.
String localReference = k.getValue();
String fullReference = reference + "@@" + localReference;
metadata.add( new SimpleMetadatum("internal.dda.reference", fullReference) );
// Map the BibTeX entry type (@article, @book, ...) to a Stock metadatum;
// unknown entry types simply get no stock entry.
Key entryType = e.getType();
String entryTypeString = entryType.getValue();
Metadatum stock;
switch (entryTypeString) {
case "collection" : stock = Stock.COLLECTION.getMetadatum(); break;
case "article" : stock = Stock.ARTICLE_OR_JOURNAL_ARTICLE.getMetadatum(); break;
case "book" : stock = Stock.MONOGRAPH.getMetadatum(); break;
case "incollection" : stock = Stock.INCOLLECTION.getMetadatum(); break;
default : stock = null;
}
if (null != stock) {
metadata.add(stock);
}
// Translate each BibTeX field into one or more Metadatum objects.
// Unrecognized fields are silently dropped (fieldTypeMetadata stays null).
Map<Key, Value> entryFields = e.getFields();
entryFields.forEach( (fieldType, fieldValue) -> {
String cleanFieldValue = convertLatexValueToNormalString(fieldValue);
List<Metadatum> fieldTypeMetadata;
String fieldTypeString = fieldType.getValue();
switch (fieldTypeString) {
case "abstract" : fieldTypeMetadata = getSplittedMetadata("dc.description.abstract", "@@", cleanFieldValue); break;
case "author" : fieldTypeMetadata = getSplittedMetadata("dc.contributor.author", "; ", cleanFieldValue); break;
case "editor" : fieldTypeMetadata = getSplittedMetadata("dc.contributor.editor", "; ", cleanFieldValue); break;
case "year" : fieldTypeMetadata = getMetadatum("dc.date.issued", cleanFieldValue); break;
case "title" : fieldTypeMetadata = getMetadatum("dc.title", cleanFieldValue); break;
case "url" : fieldTypeMetadata = getMetadatum("dc.identifier.uri", cleanFieldValue); break;
case "keywords" : fieldTypeMetadata = getSplittedMetadata("dc.subject.other", "; ", cleanFieldValue); break;
case "pages" : fieldTypeMetadata = getPageMetadatum(cleanFieldValue); break;
case "volume" : fieldTypeMetadata = getMetadatum("dc.source.volume", cleanFieldValue); break;
case "number" : fieldTypeMetadata = getMetadatum("dc.source.issue", cleanFieldValue); break;
case "issn" : fieldTypeMetadata = getMetadatum("dc.identifier.issn", cleanFieldValue); break;
case "journal" : fieldTypeMetadata = getMetadatum("dc.source.journal", cleanFieldValue); break;
case "doi" : fieldTypeMetadata = getMetadatum("dc.identifier.doi", cleanFieldValue); break;
case "urn" : fieldTypeMetadata = getMetadatum("dc.identifier.urn", cleanFieldValue); break;
case "reviewstatus" : fieldTypeMetadata = getReviewStatusMetadatum(cleanFieldValue); break;
case "embargodate" : fieldTypeMetadata = getMetadatum("internal.embargo.liftdate", cleanFieldValue); break;
case "licence" : fieldTypeMetadata = getLicenceMetadatum(cleanFieldValue); break;
case "publicationstatus" : fieldTypeMetadata = getPublicationStatusMetadatum(cleanFieldValue); break;
case "language" : fieldTypeMetadata = getMetadatum("dc.language", cleanFieldValue); break;
// Both alternative-title spellings map to the same target key.
case "titleinotherlanguage" : fieldTypeMetadata = getMetadatum("dc.title.alternative", cleanFieldValue); break;
case "translatedtitle" : fieldTypeMetadata = getMetadatum("dc.title.alternative", cleanFieldValue); break;
case "address" : fieldTypeMetadata = getMetadatum("dc.publisher.city", cleanFieldValue); break;
case "institution" : fieldTypeMetadata = getMetadatum("dc.contributor.corporateeditor", cleanFieldValue); break;
case "series" : fieldTypeMetadata = getMetadatum("dc.source.series", cleanFieldValue); break;
case "isbn" : fieldTypeMetadata = getMetadatum("dc.identifier.isbn", cleanFieldValue); break;
case "publisher" : fieldTypeMetadata = getMetadatum("dc.publisher", cleanFieldValue); break;
default : fieldTypeMetadata = null;
}
if (fieldTypeMetadata != null && 0 != fieldTypeMetadata.size() ) {
metadata.addAll(fieldTypeMetadata);
}
});
// if the following keys don't exist for an incollection, but are available for the containing collection, use that collection's key entries
// also, don't use bookurn, bookurl, bookdoi, as this identifiers are used for publication identification
if (!entryFields.containsKey(REVIEWSTATUS_KEY) && entryFields.containsKey(BOOK_REVIEWSTATUS_KEY) ) {
Value value = entryFields.get(BOOK_REVIEWSTATUS_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getReviewStatusMetadatum(cleaned);
metadata.addAll(bookMetadata);
}
if (!entryFields.containsKey(INSTITUTION_KEY) && entryFields.containsKey(BOOK_INSTITUTION_KEY) ) {
Value value = entryFields.get(BOOK_INSTITUTION_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getMetadatum("dc.contributor.corporateeditor", cleaned);
metadata.addAll(bookMetadata);
}
if (!entryFields.containsKey(LICENCE_KEY) && entryFields.containsKey(BOOK_LICENCE_KEY) ) {
Value value = entryFields.get(BOOK_LICENCE_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getLicenceMetadatum(cleaned);
metadata.addAll(bookMetadata);
}
if (!entryFields.containsKey(PUBLICATIONSTATUS_KEY) && entryFields.containsKey(BOOK_PUBLICATIONSTATUS_KEY) ) {
Value value = entryFields.get(BOOK_PUBLICATIONSTATUS_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getPublicationStatusMetadatum(cleaned);
metadata.addAll(bookMetadata);
}
if (!entryFields.containsKey(LANGUAGE_KEY) && entryFields.containsKey(BOOK_LANGUAGE_KEY) ) {
Value value = entryFields.get(BOOK_LANGUAGE_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getMetadatum("dc.language", cleaned);
metadata.addAll(bookMetadata);
}
if (!entryFields.containsKey(EMBARGODATE_KEY) && entryFields.containsKey(BOOK_EMBARGODATE_KEY) ) {
Value value = entryFields.get(BOOK_EMBARGODATE_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getMetadatum("internal.embargo.liftdate", cleaned);
metadata.addAll(bookMetadata);
}
if (!entryFields.containsKey(KEYWORDS_KEY) && entryFields.containsKey(BOOK_KEYWORDS_KEY) ) {
Value value = entryFields.get(BOOK_KEYWORDS_KEY);
String cleaned =convertLatexValueToNormalString(value);
List<Metadatum> bookMetadata = getSplittedMetadata("dc.subject.other", "; ", cleaned);
metadata.addAll(bookMetadata);
}
bundles.add(new AutonomouslyContentResolvingBundle(metadata) );
});
// Bundles are fully materialized before streaming; the stream is a view
// over the in-memory list, not a lazy parse.
result = bundles.stream();
return result;
}
catch (IOException | TokenMgrException | ParseException e) {
LOG.error("Problem", e);
throw new RuntimeException(e);
}