Commit 0561d6bb authored by Gerrit Hübbers 🃏
Browse files

WIP: Introduce BibtexBundlesStreamSource

parent b34ddd9f
package org.gesis.dda.publishing.domain.impl;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.gesis.dda.feeder.ssoar.Stock;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.BundlesStreamSource;
import org.gesis.dda.publishing.domain.Metadatum;
import org.gesis.dda.xslt.TransformUtil;
import org.gesis.dda.xslt.XsltUtil;
import org.jbibtex.BibTeXDatabase;
import org.jbibtex.BibTeXEntry;
import org.jbibtex.BibTeXParser;
import org.jbibtex.Key;
import org.jbibtex.LaTeXObject;
import org.jbibtex.LaTeXParser;
import org.jbibtex.LaTeXPrinter;
import org.jbibtex.ParseException;
import org.jbibtex.TokenMgrException;
import org.jbibtex.Value;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link BundlesStreamSource} backed by a BibTeX document held in memory.
 *
 * <p>Every BibTeX entry of the payload becomes one {@link Bundle}. The bundle's
 * metadata consist of an {@code internal.dda.reference} value built from this
 * source's reference and the entry's citation key, an optional stock metadatum
 * derived from the entry type, and the metadata derived from the entry's fields.
 * Fields whose name is not known to this class are ignored (their value is
 * nevertheless run through the LaTeX parser, so a malformed value still aborts
 * the conversion).
 */
public class BibtexBundlesStreamSource implements BundlesStreamSource {

    private static final Logger LOG = LoggerFactory.getLogger(BibtexBundlesStreamSource.class);

    /** Identifier of this source; prefix of every bundle's {@code internal.dda.reference}. */
    private final String reference;

    /** The raw BibTeX document that is parsed on every {@link #getBundlesStream()} call. */
    private final String bibtexPayload;

    /**
     * Creates a source with a randomly generated reference.
     *
     * @param bibtexPayload the BibTeX document to parse
     */
    public BibtexBundlesStreamSource(String bibtexPayload) {
        this(bibtexPayload, "bibtex-uuid" + UUID.randomUUID().toString());
    }

    /**
     * Creates a source whose reference is derived from a bundles source id.
     *
     * @param bibtexPayload   the BibTeX document to parse
     * @param bundlesSourceId id appended to {@code "bibtex-bundlessource-"} to form the reference
     */
    public BibtexBundlesStreamSource(String bibtexPayload, long bundlesSourceId) {
        this(bibtexPayload, "bibtex-bundlessource-" + bundlesSourceId);
    }

    /** Shared initialisation so that both fields can be {@code final}. */
    private BibtexBundlesStreamSource(String bibtexPayload, String reference) {
        this.bibtexPayload = bibtexPayload;
        this.reference = reference;
    }

    /**
     * Parses the BibTeX payload and converts every entry into a bundle.
     *
     * <p>The conversion is eager: the payload is fully parsed and all bundles are
     * built before the stream is returned.
     *
     * @return a stream over one bundle per BibTeX entry
     * @throws RuntimeException wrapping the underlying parser or I/O exception if the
     *                          payload or one of its field values cannot be parsed
     */
    @Override
    public Stream<Bundle> getBundlesStream() {
        try {
            BibTeXParser bibtexParser = new BibTeXParser();
            LaTeXParser latexParser = new LaTeXParser();
            LaTeXPrinter latexPrinter = new LaTeXPrinter();

            BibTeXDatabase database;
            try ( Reader reader = new StringReader(bibtexPayload) ) {
                database = bibtexParser.parseFully(reader);
            }

            List<Bundle> bundles = new ArrayList<>();
            for ( Map.Entry<Key, BibTeXEntry> entry : database.getEntries().entrySet() ) {
                bundles.add( toBundle(entry.getKey(), entry.getValue(), latexParser, latexPrinter) );
            }
            return bundles.stream();
        }
        catch (IOException | TokenMgrException | ParseException e) {
            LOG.error("Problem", e);
            throw new RuntimeException(e);
        }
    }

    @Override
    public String getReference() {
        return reference;
    }

    /** Builds the bundle for a single BibTeX entry identified by its citation key. */
    private Bundle toBundle(Key citationKey, BibTeXEntry entry, LaTeXParser latexParser, LaTeXPrinter latexPrinter) {
        Set<Metadatum> metadata = new HashSet<>();

        String fullReference = reference + "@@" + citationKey.getValue();
        metadata.add( new SimpleMetadatum("internal.dda.reference", fullReference) );

        Metadatum stock = toStock( entry.getType().getValue() );
        if (null != stock) {
            metadata.add(stock);
        }

        for ( Map.Entry<Key, Value> field : entry.getFields().entrySet() ) {
            // The value is converted before the field name is inspected, so that
            // unparsable LaTeX is reported even for fields that end up being ignored.
            String cleanFieldValue = toPlainText(field.getValue(), latexParser, latexPrinter);
            metadata.addAll( toFieldMetadata(field.getKey().getValue(), cleanFieldValue) );
        }

        // TODO(review): the WIP original ended with an empty branch for entries that
        // carry "bookreviewstatus" but no "reviewstatus"; no mapping was implemented
        // there, so none is applied here either.

        return new AutonomouslyContentResolvingBundle(metadata);
    }

    /**
     * Maps a BibTeX entry type to its stock metadatum.
     *
     * @return the stock metadatum, or {@code null} if the entry type is not mapped
     */
    private Metadatum toStock(String entryType) {
        switch ( normalize(entryType) ) {
            case "collection" : return Stock.COLLECTION.getMetadatum();
            case "article" : return Stock.ARTICLE_OR_JOURNAL_ARTICLE.getMetadatum();
            case "book" : return Stock.MONOGRAPH.getMetadatum();
            case "incollection" : return Stock.INCOLLECTION.getMetadatum();
            default : return null;
        }
    }

    /**
     * Turns a raw BibTeX field value into plain text by parsing it as LaTeX and
     * printing the parsed objects.
     *
     * @throws RuntimeException wrapping the parser exception if the value is not valid LaTeX
     */
    private String toPlainText(Value fieldValue, LaTeXParser latexParser, LaTeXPrinter latexPrinter) {
        // remove newlines, as they make problems with LaTeX parsing
        String cleanRawFieldValue = fieldValue.toUserString().replace("\r", "").replace("\n", " ");
        List<LaTeXObject> latexObjects;
        try {
            latexObjects = latexParser.parse(cleanRawFieldValue);
        }
        catch (TokenMgrException | ParseException e) {
            LOG.error("Problem parsing userString={}", cleanRawFieldValue, e);
            throw new RuntimeException(e);
        }
        return latexPrinter.print(latexObjects);
    }

    /**
     * Maps one BibTeX field to zero or more metadata.
     *
     * @param fieldType       the BibTeX field name, in any letter case
     * @param cleanFieldValue the field value as plain text
     * @return the derived metadata; empty for unmapped fields, never {@code null}
     */
    private List<Metadatum> toFieldMetadata(String fieldType, String cleanFieldValue) {
        switch ( normalize(fieldType) ) {
            case "abstract" : return getSplittedMetadata("dc.description.abstract", "@@", cleanFieldValue);
            case "author" : return getSplittedMetadata("dc.contributor.author", "; ", cleanFieldValue);
            case "editor" : return getSplittedMetadata("dc.contributor.editor", "; ", cleanFieldValue);
            case "year" : return getMetadatum("dc.date.issued", cleanFieldValue);
            case "title" : return getMetadatum("dc.title", cleanFieldValue);
            case "url" : return getMetadatum("dc.identifier.uri", cleanFieldValue);
            case "keywords" : return getSplittedMetadata("dc.subject.other", "; ", cleanFieldValue);
            case "pages" : return getPageMetadatum(cleanFieldValue);
            case "volume" : return getMetadatum("dc.source.volume", cleanFieldValue);
            case "number" : return getMetadatum("dc.source.issue", cleanFieldValue);
            case "issn" : return getMetadatum("dc.source.issn", cleanFieldValue);
            case "journal" : return getMetadatum("dc.source.journal", cleanFieldValue);
            case "doi" : return getMetadatum("dc.identifier.doi", cleanFieldValue);
            case "urn" : return getMetadatum("dc.identifier.urn", cleanFieldValue);
            case "reviewstatus" : return getReviewStatusMetadatum(cleanFieldValue);
            case "embargodate" : return getMetadatum("internal.embargo.liftdate", cleanFieldValue);
            case "licence" : return getLicenceMetadatum(cleanFieldValue);
            case "publicationstatus" : return getPublicationStatusMetadatum(cleanFieldValue);
            case "language" : return getMetadatum("dc.language", cleanFieldValue);
            case "titleinotherlanguage" : return getMetadatum("dc.title.alternative", cleanFieldValue);
            case "translatedtitle" : return getMetadatum("dc.title.alternative", cleanFieldValue);
            case "address" : return getMetadatum("dc.publisher.city", cleanFieldValue);
            case "institution" : return getMetadatum("dc.contributor.corporateeditor", cleanFieldValue);
            case "series" : return getMetadatum("dc.source.series", cleanFieldValue);
            case "isbn" : return getMetadatum("dc.identifier.isbn", cleanFieldValue);
            default : return new ArrayList<>();
        }
    }

    /**
     * Lower-cases an entry type or field name. BibTeX treats both case-insensitively,
     * so {@code @Article} and {@code @article} must select the same mapping.
     */
    private static String normalize(String bibtexName) {
        return bibtexName.toLowerCase(java.util.Locale.ROOT);
    }

    /** Returns the {@code internal.identifier.pubstatus} metadatum if the value can be resolved, else an empty list. */
    private List<Metadatum> getPublicationStatusMetadatum(String cleanFieldValue) {
        List<Metadatum> result = new ArrayList<>();
        String internalValue = TransformUtil.getSsoarPublicationStatusId(cleanFieldValue);
        if (null != internalValue) {
            result.add( new SimpleMetadatum("internal.identifier.pubstatus", internalValue) );
        }
        return result;
    }

    /** Returns the {@code internal.identifier.licence} metadatum if the value can be resolved, else an empty list. */
    private List<Metadatum> getLicenceMetadatum(String cleanFieldValue) {
        List<Metadatum> result = new ArrayList<>();
        String internalValue = XsltUtil.getAppropriateSsoarLicenceInternalId(cleanFieldValue);
        if (null != internalValue) {
            result.add( new SimpleMetadatum("internal.identifier.licence", internalValue) );
        }
        return result;
    }

    /** Returns the {@code internal.identifier.review} metadatum if the value can be resolved, else an empty list. */
    private List<Metadatum> getReviewStatusMetadatum(String cleanFieldValue) {
        List<Metadatum> result = new ArrayList<>();
        String internalValue = TransformUtil.getSsoarReviewStatusId(cleanFieldValue);
        if (null != internalValue) {
            result.add( new SimpleMetadatum("internal.identifier.review", internalValue) );
        }
        return result;
    }

    /**
     * Returns the {@code dc.source.pageinfo} metadatum for a non-empty value, with
     * every en dash (U+2013) replaced by an ASCII hyphen-minus.
     */
    private List<Metadatum> getPageMetadatum(String cleanFieldValue) {
        List<Metadatum> result = new ArrayList<>();
        if (null != cleanFieldValue && !cleanFieldValue.isEmpty() ) {
            // Unicode escape instead of a literal en dash, so the result does not
            // depend on the encoding the compiler assumes for this source file.
            String fixedCleanFieldValue = cleanFieldValue.replace("\u2013", "-");
            result.add( new SimpleMetadatum("dc.source.pageinfo", fixedCleanFieldValue) );
        }
        return result;
    }

    /** Returns a single metadatum with the given key, or an empty list for a {@code null} or empty value. */
    private List<Metadatum> getMetadatum(String metadatumKey, String cleanFieldValue) {
        List<Metadatum> result = new ArrayList<>();
        if (null != cleanFieldValue && !cleanFieldValue.isEmpty() ) {
            result.add( new SimpleMetadatum(metadatumKey, cleanFieldValue) );
        }
        return result;
    }

    /**
     * Splits a value at every occurrence of {@code separator} and returns one
     * metadatum per non-empty, trimmed part.
     *
     * @param metadatumKey    key of every produced metadatum
     * @param separator       literal separator text (not a regular expression)
     * @param cleanFieldValue the value to split; {@code null} yields an empty list
     */
    private List<Metadatum> getSplittedMetadata(String metadatumKey, String separator, String cleanFieldValue) {
        if (null == cleanFieldValue) {
            return new ArrayList<>();
        }
        // String.split interprets its argument as a regex; quote it so that the
        // separator is always matched literally.
        String[] parts = cleanFieldValue.split( java.util.regex.Pattern.quote(separator) );
        return Arrays.stream(parts)
                .map(String::trim)
                .filter(part -> !part.isEmpty() )
                .<Metadatum>map(part -> new SimpleMetadatum(metadatumKey, part) )
                .collect( Collectors.toList() );
    }

}
......@@ -37,8 +37,6 @@ public class BibtexTest {
BibTeXDatabase database;
try (InputStream is = getClass().getResourceAsStream("/bibtex/citavi-ssoar-convention-bibtex-export.bib") ) {
// try (InputStream is = getClass().getResourceAsStream("/bibtex/citavi-ssoar-convention-bibtex-export-fixed.bib") ) {
// try (InputStream is = getClass().getResourceAsStream("/bibtex/citavi-ssoar-convention-bibtex-export-utf8.bib") ) {
try ( Reader reader = new InputStreamReader(is, StandardCharsets.UTF_8) ) {
database = bibtexParser.parseFully(reader);
}
......
package org.gesis.dda.publishing.domain.impl;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.gesis.dda.feeder.ssoar.Licence;
import org.gesis.dda.feeder.ssoar.PublicationStatus;
import org.gesis.dda.feeder.ssoar.ReviewStatus;
import org.gesis.dda.feeder.ssoar.Stock;
import org.gesis.dda.publishing.domain.Bundle;
import org.gesis.dda.publishing.domain.BundlesStreamSource;
import org.gesis.dda.publishing.domain.Metadatum;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Tests {@link BibtexBundlesStreamSource} against a BibTeX export stored as a
 * classpath resource, checking the metadata produced for four of its entries.
 */
public class BibtexBundlesStreamSourceTest {

    private static final Logger LOG = LoggerFactory.getLogger(BibtexBundlesStreamSourceTest.class);

    /** Classpath location of the BibTeX export used as test input. */
    private static final String BIBTEX_RESOURCE = "/bibtex/citavi-ssoar-convention-bibtex-export.bib";

    /**
     * Parses the export with bundles source id 4711 and verifies the number of
     * bundles as well as the metadata of an article, a collection, a book and an
     * incollection entry.
     */
    @Test
    public void testCitaviBibtexExport() throws IOException {
        String bibtexString;
        // IOUtils.toString does not close the stream, so close it here explicitly.
        try ( java.io.InputStream is = getClass().getResourceAsStream(BIBTEX_RESOURCE) ) {
            // Fail with a readable message instead of a bare NullPointerException.
            assertTrue("Missing test resource " + BIBTEX_RESOURCE, null != is);
            bibtexString = IOUtils.toString(is, StandardCharsets.UTF_8);
        }
        LOG.info("bibtexString=\n{}", bibtexString);
        BundlesStreamSource bss = new BibtexBundlesStreamSource(bibtexString, 4711);
        bss.getBundlesStream().map(Bundle::getMetadata).map(Object::toString).forEach(LOG::info);
        assertEquals(9, bss.getBundlesStream().count());
        // journal article
        {
            Set<Metadatum> metadata = bss.getAllMetadata("Blohm.2013");
            metadata.stream().map(Object::toString).forEach(LOG::info);
            assertTrue(metadata.contains( new SimpleMetadatum("internal.dda.reference", "bibtex-bundlessource-4711@@Blohm.2013") ) );
            assertTrue(metadata.contains( Stock.ARTICLE_OR_JOURNAL_ARTICLE.getMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.description.abstract", "Nonresponse is an ongoing challenge for survey research. In the German General Social Survey (ALLBUS) 2010, an experiment was set up to test the effect of respondent incentives on outcome rates, sam¬ple composition and fieldwork efforts. A random subsample of target persons was offered a monetary incentive of €10 to be paid upon completion of the interview. The other part of the sample acted as a control group receiving no incentive. The incentive used in ALLBUS 2010 led to an increase in the response rate, mainly by improving the cooperation rate. It did not change the sample composition in a major way. Concerning fieldwork efforts, a slight reduction was observed: In the incentive condition, a given number of interviews was achieved with a lower number of contact attempts than in the no incentive condition. (author’s abstract)") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.description.abstract", "Nonresponse ist eine Herausforderung für die gesamte Umfrageforschung. In der Allgemeinen Bevölkerungsumfrage der Sozialwissenschaften (ALLBUS) wurde im Jahr 2010 ein Experiment durchgeführt, um die Effekte von Befragten-Incentives auf die Ausschöpfungsquote, die Stichprobenzusammensetzung und den in der Feldarbeit notwendigen Aufwand zu untersuchen. Einer Zufallsstichprobe der Zielpersonen wurde für die Teilnahme an der Umfrage ein monetäres Incentive in Höhe von 10 angeboten. Der verbleibende Teil der Zielpersonen fungierte als Kontrollgruppe und erhielt kein Incentive. Das verwendete Incentive führte zu einer höheren Kooperationsrate und einer höheren Ausschöpfung in der Experimentalgruppe. Die Stichprobenzusammensetzung unterschied sich nicht wesentlich zwischen Experimental- und Kontrollgruppe. In der Experimentalgruppe konnten die Interviewer eine gegebene Zahl von Interviews mit weniger Kontaktversuchen erzielen als in der Kontrollgruppe. (Autorenreferat)") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.author", "Blohm, Michael") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.author", "Koch, Achim") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.date.issued", "2013") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.title", "Respondent incentives in a national face-to-face survey: effects on outcome rates, sample composition and fieldwork efforts") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.uri", "https://mda.gesis.org/index.php/mda/article/view/2013.004") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "ALLBUS") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Anreizsystem" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Antwortverhalten" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Befragung" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Erhebungstechniken und Analysetechniken der Sozialwissenschaften" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Experiment" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Feldforschung" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Geld" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Interview" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Kooperation" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.subject.other", "Stichprobe" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.pageinfo", "89-122" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.volume", "7" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.issue", "1" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.issn", "2190-4936" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.journal", "Methods, data, analyses : a journal for quantitative methods and survey methodology (mda)" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.doi", "10.12758/mda.2013.004" ) ) );
            assertTrue(metadata.contains( ReviewStatus.PEER_REVIEWED.getInternalIdentifierReviewMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("internal.embargo.liftdate", "2006-03-01" ) ) );
            assertTrue(metadata.contains( Licence.CC_BY.getInternalIdentifierLicenceMetadatum() ) );
            assertTrue(metadata.contains( PublicationStatus.PUBLISHED_VERSION.getInternalIdentifierPubstatusMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.language", "en" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.title.alternative", "Der Einsatz von Befragten-Incentives in einer bundesweiten face-to-face-Umfrage: Effekte auf Ausschöpfung, Stichprobenzusammensetzung und Feldarbeitsaufwand" ) ) );
        }
        // collection
        {
            Set<Metadatum> metadata = bss.getAllMetadata("Andreas.2018");
            metadata.stream().map(Object::toString).forEach(LOG::info);
            assertTrue(metadata.contains( new SimpleMetadatum("internal.dda.reference", "bibtex-bundlessource-4711@@Andreas.2018") ) );
            assertTrue(metadata.contains( Stock.COLLECTION.getMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.publisher.city", "Lüneburg") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.editor", "Andreas, Michael") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.editor", "Kasprowicz, Dawid") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.editor", "Rieger, Stefan") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.pageinfo", "190") ) );
            assertTrue(metadata.contains( Licence.CC_BY_SA_4_0.getInternalIdentifierLicenceMetadatum() ) );
        }
        // book (monograph)
        {
            Set<Metadatum> metadata = bss.getAllMetadata("Engelhardt.2018");
            metadata.stream().map(Object::toString).forEach(LOG::info);
            assertTrue(metadata.contains( new SimpleMetadatum("internal.dda.reference", "bibtex-bundlessource-4711@@Engelhardt.2018") ) );
            assertTrue(metadata.contains( Stock.MONOGRAPH.getMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.urn", "urn:nbn:de:0168-ssoar-56985-5") ) );
            assertTrue(metadata.contains( ReviewStatus.UNKNOWN.getInternalIdentifierReviewMetadatum() ) );
            assertTrue(metadata.contains( Licence.DEPOSIT.getInternalIdentifierLicenceMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.corporateeditor", "Staatsinstitut für Familienforschung an der Universität Bamberg (ifb)") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.series", "Population and family studies") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.isbn", "978-3-86309-548-2") ) );
        }
        // incollection
        {
            Set<Metadatum> metadata = bss.getAllMetadata("Lucke.2018");
            metadata.stream().map(Object::toString).forEach(LOG::info);
            assertTrue(metadata.contains( new SimpleMetadatum("internal.dda.reference", "bibtex-bundlessource-4711@@Lucke.2018") ) );
            assertTrue(metadata.contains( Stock.INCOLLECTION.getMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.author", "Lucke, Jörn von") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.title", "Smart Government auf einem schmalen Grat") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.uri", "https://www.ssoar.info/ssoar/handle/document/57563") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.source.pageinfo", "97-125") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.isbn", "978-3-9818892-5-3") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.editor", "Mohabbat Kar, Resa") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.editor", "Thapa, Basanta E. P.") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.editor", "Parycek, Peter") ) );
            // don't include encompassing collection's title
            assertFalse(metadata.contains( new SimpleMetadatum("dc.title", "(Un)berechenbar? Algorithmen und Automatisierung in Staat und Gesellschaft") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.date.issued", "2018") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.publisher.city", "Berlin") ) );
            assertTrue(metadata.contains( ReviewStatus.REVIEWED.getInternalIdentifierReviewMetadatum() ) );
            assertTrue(metadata.contains( new SimpleMetadatum("internal.embargo.liftdate", "2003-05-01" ) ) );
            assertTrue(metadata.contains( PublicationStatus.PUBLISHED_VERSION.getInternalIdentifierPubstatusMetadatum() ) );
            assertTrue(metadata.contains( Licence.CC_BY_3_0.getInternalIdentifierLicenceMetadatum() ) );
            // don't include encompassing collection's URN
            assertFalse(metadata.contains( new SimpleMetadatum("dc.identifier.urn", "urn:nbn:de:0168-ssoar-57518-2") ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.language", "de" ) ) );
            assertTrue(metadata.contains( new SimpleMetadatum("dc.identifier.urn", "urn:nbn:de:0168-ssoar-57563-2") ) );
            // don't include encompassing collection's embargo date
            assertFalse(metadata.contains( new SimpleMetadatum("internal.embargo.liftdate", "2007-05-01") ) );
            // include encompassing collection's corporate editor
            assertTrue(metadata.contains( new SimpleMetadatum("dc.contributor.corporateeditor", "Fraunhofer-Institut für Offene Kommunikationssysteme FOKUS, Kompetenzzentrum Öffentliche IT") ) );
        }
    }

}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment