Commit 4c7ec5b3 authored by Gerrit Hübbers's avatar Gerrit Hübbers 🃏
Browse files

Introduce dependency for downloading PDFs from HTML meta tags.

parent b32327fe
......@@ -58,6 +58,7 @@
<spring-security.version>4.0.2.RELEASE</spring-security.version>
<springfox.version>2.0.3</springfox.version>
<poi.version>3.14</poi.version>
<jsoup.version>1.9.2</jsoup.version>
<poi-ooxml.version>3.14</poi-ooxml.version>
<!-- Sonar properties -->
<project.testresult.directory>${project.build.directory}/test-results</project.testresult.directory>
......@@ -372,6 +373,11 @@
<artifactId>poi-ooxml</artifactId>
<version>${poi-ooxml.version}</version>
</dependency>
<!-- jsoup HTML parser: used to read PDF download links from HTML meta tags -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>${jsoup.version}</version>
</dependency>
<dependency>
<groupId>org.mockftpserver</groupId>
<artifactId>MockFtpServer</artifactId>
......
......@@ -3,6 +3,8 @@ package org.gesis.dda.publishing.domain.impl;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
......@@ -10,6 +12,10 @@ import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.gesis.dda.publishing.domain.Metadatum;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.BeforeClass;
import org.junit.Test;
import org.mockftpserver.fake.FakeFtpServer;
......@@ -109,4 +115,25 @@ public class ContentResolverTest {
InputStream actualInputStream = ContentResolver.resolveContentInputStream(INVALID_FTP_METADATA);
assertNull(actualInputStream);
}
/**
 * Smoke test for the jsoup dependency: fetches the Wikipedia front page,
 * applies a CSS selector and logs whatever elements matched.
 * <p>
 * Makes no assertions; it only fails if fetching or parsing throws.
 *
 * @throws IOException if the page cannot be retrieved
 */
@Test
public void smokeTestJsoup() throws IOException {
    final String headlineSelector = "#mp-itn b a";
    Elements headlineLinks = Jsoup.connect("http://en.wikipedia.org/")
            .get()
            .select(headlineSelector);
    log.info("{}", headlineLinks);
}
/**
 * Checks that the PDF referenced by the {@code citation_pdf_url} meta tag of
 * an arXiv abstract page is byte-identical to the reference copy stored at
 * {@code src/test/resources/1404.5997.pdf}.
 * <p>
 * NOTE(review): this test fetches from arxiv.org over the network, so it
 * fails when offline or when the remote PDF changes.
 *
 * @throws IOException if the page or the PDF cannot be fetched, or the
 *         local reference file cannot be read
 */
@Test
public void testCitationPdfUrl() throws IOException {
    Document doc = Jsoup.connect("https://arxiv.org/abs/1404.5997").get();
    Elements citationPdfUrlMetaElements = doc.select("meta[name=citation_pdf_url]");
    // Fail with a readable message instead of an IndexOutOfBoundsException from get(0).
    assertTrue("page contains no citation_pdf_url meta element", !citationPdfUrlMetaElements.isEmpty());
    Element firstCitationPdfUrlMetaElement = citationPdfUrlMetaElements.get(0);
    String firstPdfUrlString = firstCitationPdfUrlMetaElement.absUrl("content");
    log.info("{}", firstPdfUrlString);
    URL firstPdfUrl = new URL(firstPdfUrlString);
    // Both streams are closed even when the comparison or an open call throws.
    // The local file is opened first so a missing fixture fails before any download starts.
    try (InputStream expectedInputStream = new FileInputStream("src/test/resources/1404.5997.pdf");
            InputStream firstPdfInputStream = firstPdfUrl.openStream()) {
        boolean isEqual = IOUtils.contentEquals(expectedInputStream, firstPdfInputStream);
        assertTrue(isEqual);
    }
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment