Commit 584204ef authored by Gerrit Hübbers's avatar Gerrit Hübbers 🃏
Browse files

Resolve DOIs and URNs to InputStream

parent 076060a3
......@@ -58,8 +58,10 @@
<spring-security.version>4.0.2.RELEASE</spring-security.version>
<springfox.version>2.0.3</springfox.version>
<poi.version>3.14</poi.version>
<jsoup.version>1.9.2</jsoup.version>
<poi-ooxml.version>3.14</poi-ooxml.version>
<jsoup.version>1.9.2</jsoup.version>
<simplemagic.version>1.8</simplemagic.version>
<mockftpserver.version>2.6</mockftpserver.version>
<!-- Sonar properties -->
<project.testresult.directory>${project.build.directory}/test-results</project.testresult.directory>
<sonar-maven-plugin.version>2.6</sonar-maven-plugin.version>
......@@ -378,10 +380,15 @@
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<dependency>
<groupId>com.j256.simplemagic</groupId>
<artifactId>simplemagic</artifactId>
<version>${simplemagic.version}</version>
</dependency>
<dependency>
<groupId>org.mockftpserver</groupId>
<artifactId>MockFtpServer</artifactId>
<version>2.6</version>
<version>${mockftpserver.version}</version>
<scope>test</scope>
</dependency>
</dependencies>
......
......@@ -6,17 +6,27 @@ import java.net.URL;
import java.util.Set;
import org.gesis.dda.publishing.domain.Metadatum;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.j256.simplemagic.ContentInfo;
import com.j256.simplemagic.ContentInfoUtil;
import com.j256.simplemagic.ContentType;
public class ContentResolver {
private final static Logger log = LoggerFactory.getLogger(ContentResolver.class);
private final static ContentInfoUtil contentInfoUtil = new ContentInfoUtil();
public static InputStream resolveContentInputStream(Set<Metadatum> metadata) {
InputStream content = null;
for (Metadatum metadatum : metadata) {
if ( metadatum.getKey().equals("dc.identifier.url") ) {
String url = metadatum.getValue();
try {
......@@ -29,9 +39,14 @@ public class ContentResolver {
}
}
else if ( metadatum.getKey().equals("dc.identifier.urn") ) {
content = null;
String url = "http://nbn-resolving.de/" + metadatum.getValue(); // http://nbn-resolving.de/urn:nbn:de:0156-3953016
content = resolveLink(url);
if (null != content) break;
}
else if ( metadatum.getKey().equals("dc.identifier.doi") ) {
String url = "http://dx.doi.org/" + metadatum.getValue(); // http://dx.doi.org/10.1140/epjc/s10052-014-3060-7
content = resolveLink(url);
if (null != content) break;
content = null;
}
else {
......@@ -42,4 +57,34 @@ public class ContentResolver {
return content;
}
/**
 * Tries to open a stream on the PDF document reachable through {@code url}.
 *
 * <p>The content behind {@code url} is sniffed first. If it is a PDF, a fresh stream on
 * {@code url} is returned. If it looks like HTML/XHTML/XML, the page is parsed and the first
 * {@code <meta name="citation_pdf_url">} element's {@code content} attribute is used as the
 * PDF location.
 *
 * @param url the URL to resolve
 * @return an open stream on the PDF (the caller is responsible for closing it), or
 *         {@code null} if the content type could not be determined, no PDF link was found,
 *         or an I/O error occurred
 */
private static InputStream resolveLink(String url) {
    try {
        ContentInfo possibleContentInfo;
        // The probe stream is only used for sniffing; close it right away so the
        // underlying connection is not leaked.
        try (InputStream possibleContent = new URL(url).openStream()) {
            possibleContentInfo = contentInfoUtil.findMatch(possibleContent);
        }
        // findMatch yields null when none of the magic entries matched.
        if (null == possibleContentInfo) {
            log.warn("Could not determine content type for URL {}", url);
            return null;
        }
        ContentType possibleContentType = possibleContentInfo.getContentType();
        if (ContentType.PDF == possibleContentType) {
            // Sniffing consumed bytes of the probe stream, so hand out a fresh one.
            return new URL(url).openStream();
        }
        if (ContentType.HTML == possibleContentType
                || ContentType.XHTML == possibleContentType
                || ContentType.XML == possibleContentType) {
            Document doc = Jsoup.connect(url).get();
            Elements citationPdfUrlMetaElements = doc.select("meta[name=citation_pdf_url]");
            if (!citationPdfUrlMetaElements.isEmpty()) {
                Element firstCitationPdfUrlMetaElement = citationPdfUrlMetaElements.get(0);
                String firstPdfUrlString = firstCitationPdfUrlMetaElement.absUrl("content");
                return new URL(firstPdfUrlString).openStream();
            }
        }
    }
    catch (IOException e) {
        // Pass the exception as the last argument so the cause ends up in the log.
        log.warn("Could not find InputStream for URL {}", url, e);
    }
    return null;
}
}
package org.gesis.dda.publishing.domain.impl;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
......@@ -29,6 +29,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.ImmutableSet;
import com.j256.simplemagic.ContentInfo;
import com.j256.simplemagic.ContentInfoUtil;
import com.j256.simplemagic.ContentType;
public class ContentResolverTest {
......@@ -48,6 +51,18 @@ public class ContentResolverTest {
private static Metadatum DC_IDENTIFIER_URL_WITH_INVALID_URL_METADATUM;
private static Set<Metadatum> INVALID_FTP_METADATA;
private static Metadatum URN_TO_LANDING_PAGE_METADATUM;
private static Set<Metadatum> URN_TO_LANDING_PAGE_METADATA;
private static Metadatum URN_TO_PDF_METADATUM;
private static Set<Metadatum> URN_TO_PDF_METADATA;
private static Metadatum DOI_TO_LANDING_PAGE_METADATUM;
private static Set<Metadatum> DOI_TO_LANDING_PAGE_METADATA;
private static Metadatum DOI_TO_PDF_METADATUM;
private static Set<Metadatum> DOI_TO_PDF_METADATA;
@BeforeClass
public static void setupFtpServer() throws IOException {
FakeFtpServer fakeFtpServer = new FakeFtpServer();
......@@ -63,6 +78,18 @@ public class ContentResolverTest {
DC_IDENTIFIER_URL_WITH_INVALID_URL_METADATUM = new SimpleMetadatum("dc.identifier.url", FTP_SERVER_ROOT + "doesntExist.pdf");
INVALID_FTP_METADATA = ImmutableSet.of(DC_IDENTIFIER_URL_WITH_INVALID_URL_METADATUM);
URN_TO_LANDING_PAGE_METADATUM = new SimpleMetadatum("dc.identifier.urn", "urn:nbn:de:0111-pedocs-123709");
URN_TO_LANDING_PAGE_METADATA = ImmutableSet.of(URN_TO_LANDING_PAGE_METADATUM);
URN_TO_PDF_METADATUM = new SimpleMetadatum("dc.identifier.urn", "urn:nbn:de:0156-3953016");
URN_TO_PDF_METADATA = ImmutableSet.of(URN_TO_PDF_METADATUM);
DOI_TO_LANDING_PAGE_METADATUM = new SimpleMetadatum("dc.identifier.doi", "10.1140/epjc/s10052-014-3060-7");
DOI_TO_LANDING_PAGE_METADATA = ImmutableSet.of(DOI_TO_LANDING_PAGE_METADATUM);
DOI_TO_PDF_METADATUM = new SimpleMetadatum("dc.identifier.doi", "doi:10.1392/BC1.0");
DOI_TO_PDF_METADATA = ImmutableSet.of(DOI_TO_PDF_METADATUM);
UserAccount userAccount = new UserAccount(FTP_USERNAME, FTP_PASSWORD, FTP_USER_HOME_LOCATION);
userAccount.setAccountRequiredForLogin(false);
userAccount.setPasswordCheckedDuringValidation(false);
......@@ -136,4 +163,84 @@ public class ContentResolverTest {
boolean isEqual = IOUtils.contentEquals(expectedInputStream, firstPdfInputStream);
assertTrue(isEqual);
}
/**
 * Smoke test: SimpleMagic classifies the classpath resource
 * {@code /Bundeskabinett_PDF.pdf} as {@link ContentType#PDF}.
 */
@Test
public void smokeTestSimpleMagic() throws IOException {
    ContentInfoUtil util = new ContentInfoUtil();
    // Close the resource stream once it has been sniffed instead of leaking it.
    try (InputStream is = getClass().getResourceAsStream("/Bundeskabinett_PDF.pdf")) {
        ContentInfo info = util.findMatch(is);
        log.info( info.toString() );
        assertEquals( ContentType.PDF, info.getContentType() );
    }
}
/**
 * Checks that SimpleMagic reports {@link ContentType#XML} for the page served at the
 * hard-coded handle URL below.
 *
 * NOTE(review): the host name looks like an intranet address, so this test presumably
 * only passes inside that network - confirm.
 */
@Test
public void testSimpleMagicDSpaceHtmlBehavior() throws IOException {
    ContentInfoUtil util = new ContentInfoUtil();
    // Close the network stream once it has been sniffed instead of leaking the connection.
    try (InputStream is = new URL("http://ssoar.svko-dda-test.gesis.intra/ssoar/handle/document/34309").openStream()) {
        ContentInfo info = util.findMatch(is);
        log.info( info.toString() );
        assertEquals( ContentType.XML, info.getContentType() );
    }
}
/**
 * Checks that SimpleMagic reports {@link ContentType#HTML} for the page served at
 * {@code http://www.google.com}. Requires network access to that host.
 */
@Test
public void testSimpleMagicGoogleHtmlBehavior() throws IOException {
    ContentInfoUtil util = new ContentInfoUtil();
    // Close the network stream once it has been sniffed instead of leaking the connection.
    try (InputStream is = new URL("http://www.google.com").openStream()) {
        ContentInfo info = util.findMatch(is);
        log.info( info.toString() );
        assertEquals( ContentType.HTML, info.getContentType() );
    }
}
/**
 * Checks that SimpleMagic reports {@link ContentType#HTML} for the content served when
 * resolving the URN {@code urn:nbn:de:0111-pedocs-123709} through nbn-resolving.de.
 * Requires network access to that host.
 */
@Test
public void testSimpleMagicPedocsBehavior() throws IOException {
    ContentInfoUtil util = new ContentInfoUtil();
    // Close the network stream once it has been sniffed instead of leaking the connection.
    try (InputStream is = new URL("http://nbn-resolving.de/urn:nbn:de:0111-pedocs-123709").openStream()) {
        ContentInfo info = util.findMatch(is);
        log.info( info.toString() );
        assertEquals( ContentType.HTML, info.getContentType() );
    }
}
/**
 * Resolves {@code URN_TO_LANDING_PAGE_METADATA} through {@link ContentResolver} and expects
 * the resulting stream to be byte-identical to the classpath resource
 * {@code /Dumont_et_al._2014_ZfE_A.pdf}.
 */
@Test
public void testDcIdentifierUrnViaLandingPage() throws IOException {
    // Both streams are closed when the comparison is done; a null resource is skipped on close.
    try (InputStream expectedInputStream = getClass().getResourceAsStream("/Dumont_et_al._2014_ZfE_A.pdf");
            InputStream actualInputStream = ContentResolver.resolveContentInputStream(URN_TO_LANDING_PAGE_METADATA)) {
        boolean isEqual = IOUtils.contentEquals(expectedInputStream, actualInputStream);
        assertTrue(isEqual);
    }
}
/**
 * Resolves {@code URN_TO_PDF_METADATA} through {@link ContentResolver} and expects the
 * resulting stream to be byte-identical to the classpath resource {@code /ab_013_01.pdf}.
 */
@Test
public void testDcIdentifierUrnDirectPdf() throws IOException {
    // Both streams are closed when the comparison is done; a null resource is skipped on close.
    try (InputStream expectedInputStream = getClass().getResourceAsStream("/ab_013_01.pdf");
            InputStream actualInputStream = ContentResolver.resolveContentInputStream(URN_TO_PDF_METADATA)) {
        boolean isEqual = IOUtils.contentEquals(expectedInputStream, actualInputStream);
        assertTrue(isEqual);
    }
}
/**
 * Resolves {@code DOI_TO_LANDING_PAGE_METADATA} through {@link ContentResolver} and expects
 * the resulting stream to be byte-identical to the classpath resource
 * {@code /art_10.1140_epjc_s10052-014-3060-7.pdf}.
 */
@Test
public void testDcIdentifierDoiViaLandingPage() throws IOException {
    // Both streams are closed when the comparison is done; a null resource is skipped on close.
    try (InputStream expectedInputStream = getClass().getResourceAsStream("/art_10.1140_epjc_s10052-014-3060-7.pdf");
            InputStream actualInputStream = ContentResolver.resolveContentInputStream(DOI_TO_LANDING_PAGE_METADATA)) {
        boolean isEqual = IOUtils.contentEquals(expectedInputStream, actualInputStream);
        assertTrue(isEqual);
    }
}
/**
 * Resolves {@code DOI_TO_PDF_METADATA} through {@link ContentResolver} and expects the
 * resulting stream to be byte-identical to the classpath resource
 * {@code /casalini_bc_en.pdf}.
 */
@Test
public void testDcIdentifierDoiDirectPdf() throws IOException {
    // Both streams are closed when the comparison is done; a null resource is skipped on close.
    try (InputStream expectedInputStream = getClass().getResourceAsStream("/casalini_bc_en.pdf");
            InputStream actualInputStream = ContentResolver.resolveContentInputStream(DOI_TO_PDF_METADATA)) {
        boolean isEqual = IOUtils.contentEquals(expectedInputStream, actualInputStream);
        assertTrue(isEqual);
    }
}
}
......@@ -3,14 +3,15 @@
<configuration scan="true">
<property name="LOG_TEMP" value="./logs"/>
<include resource="org/springframework/boot/logging/logback/base.xml"/>
<logger name="org.gesis" level="INFO"/>
<logger name="org.gesis.dda.publishing" level="DEBUG"/>
<logger name="org.springframework.web.client.RestTemplate" level="INFO"/>
<logger name="org.hibernate.ejb.HibernatePersistence" level="OFF"/>
<logger name="org.apache.catalina.startup.DigesterFactory" level="OFF"/>
<root level="WARN">
<appender-ref ref="CONSOLE"/>
</root>
</configuration>
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment