Commit 679dc028 authored by Gerrit Hübbers 🃏
Browse files

Introduce Highwire Press-only Content Resolving Strategy

parent ac9beccd
......@@ -238,9 +238,9 @@ public class ContentResolver {
}
else if ( metadatumKey.equals("dc.identifier.doi") ) {
String url = metadatum.getValue(); // http://dx.doi.org/10.1140/epjc/s10052-014-3060-7
String url = metadatum.getValue(); // https://doi.org/10.1140/epjc/s10052-014-3060-7
if ( ! url.startsWith("http") ) {
url = "http://dx.doi.org/" + url;
url = "https://doi.org/" + url;
}
Set<String> foundContentsUrlStrings = resolveLinks(url);
contentsUrlStrings.addAll(foundContentsUrlStrings);
......@@ -328,6 +328,9 @@ public class ContentResolver {
// results from degruyter.com and rely on alternative PDF sources
result = new EmptySetResolvingStrategy();
}
else if ( seedUrlString.contains("librelloph.com") ) {
result = new HighWirePressCitationPdfUrlOnlyStrategy();
}
else {
result = new CatchAllResolvingStrategy();
}
......
package org.gesis.dda.publishing.domain.impl;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import org.gesis.dda.publishing.domain.ContentsUrlsResolvingStrategy;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.j256.simplemagic.ContentInfo;
import com.j256.simplemagic.ContentType;
/**
 * Strategy that resolves content URLs exclusively from the HighWire Press
 * {@code citation_pdf_url} meta tags of the HTML page behind the seed URL.
 * <p>
 * Every candidate URL is followed through possible redirects and then probed:
 * only a candidate whose content is detected as PDF is accepted. The first
 * accepted candidate ends the search, so the result contains at most one URL.
 * <p>
 * Fetching and probing is attempted up to {@code MAXIMUM_RETRIES + 1} times.
 * After an I/O failure the strategy sleeps for a linearly growing period
 * before it continues.
 *
 * @author huebbegt
 *
 */
public class HighWirePressCitationPdfUrlOnlyStrategy implements ContentsUrlsResolvingStrategy {

    private static final Logger log = LoggerFactory.getLogger(HighWirePressCitationPdfUrlOnlyStrategy.class);

    /** Number of retries that may follow the initial attempt. */
    private static final int MAXIMUM_RETRIES = 3;

    /**
     * Collects the first {@code citation_pdf_url} of the seed page that really
     * delivers a PDF.
     *
     * @param seedUrl URL of the HTML landing page to inspect
     * @return a set holding the resolved PDF URL, or an empty set if no
     *         candidate could be verified within the permitted attempts
     * @throws RuntimeException if the thread is interrupted while backing off
     */
    @Override
    public Set<String> getContentsUrls(String seedUrl) {
        Set<String> contentsUrlStrings = new HashSet<>();

        // The attempt counter is local so that concurrent or repeated calls on
        // the same instance cannot interfere with each other.
        int currentAttempt = 0;
        boolean wasSuccessful = false;

        while ((currentAttempt <= MAXIMUM_RETRIES) && (!wasSuccessful)) {
            currentAttempt++;
            try {
                Document doc = Jsoup.connect(seedUrl).get();
                Elements possiblePdfUrlElements = doc.select("meta[name=citation_pdf_url]");

                for (Element possiblePdfUrlElement : possiblePdfUrlElements) {
                    String possiblePdfUrlString = possiblePdfUrlElement.absUrl("content");
                    if (possiblePdfUrlString.isEmpty()) {
                        // absUrl yields "" when the attribute is missing or cannot be
                        // made absolute; there is nothing to probe in that case.
                        continue;
                    }

                    String resolvedPossiblePdfUrlString = ContentHelpers.resolvePossibleRedirects(possiblePdfUrlString);
                    try {
                        if (isPdf(resolvedPossiblePdfUrlString)) {
                            contentsUrlStrings.add(resolvedPossiblePdfUrlString);
                            wasSuccessful = true;
                            break;
                        }
                    }
                    catch (IOException e) {
                        log.warn("Could not find InputStream for identified URL {}", resolvedPossiblePdfUrlString, e);
                        handleGracefully(currentAttempt);
                    }
                }
            }
            catch (IOException e) {
                log.warn("Could not find InputStream for seed URL {}", seedUrl, e);
                handleGracefully(currentAttempt);
            }
        }

        return contentsUrlStrings;
    }

    /**
     * Probes the given URL and reports whether its content is detected as PDF.
     * The stream opened for probing is always closed again.
     *
     * @param urlString URL to probe
     * @return {@code true} if a content type was detected and it is PDF
     * @throws IOException if the URL is malformed or its content cannot be read
     */
    private static boolean isPdf(String urlString) throws IOException {
        URL url = new URL(urlString);
        try (InputStream in = ContentHelpers.getTimeoutBackoffRetryAwareURLInputStream(url)) {
            ContentInfo ci = ContentHelpers.CONTENT_INFO_UTIL.findMatch(in);
            return (null != ci) && ContentType.PDF.equals(ci.getContentType());
        }
    }

    /**
     * Reacts to a failed attempt: backs off while retries remain, otherwise
     * reports that the retries are exhausted.
     *
     * @param currentAttempt 1-based number of the attempt that just failed
     */
    private void handleGracefully(int currentAttempt) {
        if (currentAttempt <= MAXIMUM_RETRIES) {
            log.warn("HighWirePressCitationPdfUrlOnlyStrategy attempt {} failed. Going to sleep.", currentAttempt);
            sleep(currentAttempt);
        }
        else {
            log.error("Ran out of retries.");
        }
    }

    /**
     * Sleeps for {@code currentAttempt} seconds (linear back-off).
     *
     * @param currentAttempt 1-based number of the attempt that just failed
     * @throws RuntimeException if the thread is interrupted while sleeping
     */
    private void sleep(int currentAttempt) {
        try {
            Thread.sleep(1000L * currentAttempt);
        }
        catch (InterruptedException e1) {
            log.error("Interrupted during thread sleep", e1);
            // Restore the interrupt flag so that callers further up can still see it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e1);
        }
    }
}
......@@ -690,7 +690,6 @@ public class ContentResolverTest {
assertTrue(isEqual);
}
@Test
public void resolveDeGruyterDoi() throws MalformedURLException, IOException {
String underTest = "https://doi.org/10.14361/9783839437124";
......@@ -701,6 +700,27 @@ public class ContentResolverTest {
assertEquals(0, suppliers.size());
}
/**
 * Resolves a Librello "Challenges in Sustainability" article from its DOI and
 * landing-page URL and verifies that exactly one content supplier is found
 * whose bytes equal those of the publisher's PDF download.
 * <p>
 * Both streams are opened in try-with-resources so the underlying
 * connections are released even when an assertion fails.
 */
@Test
public void resolveLibrelloChallengesInSustainabilityCis() throws MalformedURLException, IOException {
    Metadatum metadatum1 = new SimpleMetadatum("dc.identifier.doi", "https://doi.org/10.12924/cis2017.05010043");
    Metadatum metadatum2 = new SimpleMetadatum("dc.identifier.url", "http://www.librelloph.com/challengesinsustainability/article/view/cis-5.1.43");
    Set<Metadatum> metadata = ImmutableSet.of(metadatum1, metadatum2);

    URL expectedUrl = new URL("http://www.librelloph.com/challengesinsustainability/article/download/cis-5.1.43/pdf");
    try (InputStream expectedInputStream = expectedUrl.openStream()) {
        assertNotNull(expectedInputStream);

        Set<InputStreamAndFilenameSupplier> suppliers = ContentResolver.resolveContentsSuppliers(metadata);
        assertEquals(1, suppliers.size());

        InputStreamAndFilenameSupplier actualSupplier = suppliers.iterator().next();
        LOG.info("resolved URL is {}", actualSupplier.getFilename() );

        try (InputStream actualInputStream = actualSupplier.get()) {
            LOG.info("actualInputStream={}", actualInputStream);
            assertNotNull(actualInputStream);

            boolean isEqual = IOUtils.contentEquals(actualInputStream, expectedInputStream);
            assertTrue(isEqual);
        }
    }
}
public static String unrootDomainName(String url) {
try {
String result;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment