Initial Commit

This commit is contained in:
Bommels05 2025-11-10 20:06:06 +01:00
commit 7ca161caeb
18 changed files with 1401 additions and 0 deletions

46
.gitignore vendored Normal file
View File

@ -0,0 +1,46 @@
.gradle
build/
!gradle/wrapper/gradle-wrapper.jar
!**/src/main/**/build/
!**/src/test/**/build/
### IntelliJ IDEA ###
.idea/modules.xml
.idea/jarRepositories.xml
.idea/compiler.xml
.idea/libraries/
*.iws
*.iml
*.ipr
out/
!**/src/main/**/out/
!**/src/test/**/out/
### Eclipse ###
.apt_generated
.classpath
.factorypath
.project
.settings
.springBeans
.sts4-cache
bin/
!**/src/main/**/bin/
!**/src/test/**/bin/
### NetBeans ###
/nbproject/private/
/nbbuild/
/dist/
/nbdist/
/.nb-gradle/
### VS Code ###
.vscode/
### Mac OS ###
.DS_Store
/run
/run_

42
build.gradle Normal file
View File

@ -0,0 +1,42 @@
// Gradle build script for the BefatorWeb application.
plugins {
id 'java'
id 'application'
// Shadow plugin; presumably used to build a single runnable JAR (see the shadowJar block below).
id("com.gradleup.shadow") version "9.0.0-rc1"
}
group = 'de.bommels05'
version = '1.0-SNAPSHOT'
repositories {
mavenCentral()
// NOTE(review): the de.bommels05:DBLib snapshot below is presumably resolved from the local Maven repository - confirm.
mavenLocal()
}
dependencies {
// Test dependencies (JUnit 5 via its BOM).
testImplementation platform('org.junit:junit-bom:5.10.0')
testImplementation 'org.junit.jupiter:junit-jupiter'
// Runtime libraries: logging, HTTP server, JSON, caching, HTML parsing, GEXF export and the database layer.
implementation("org.slf4j:slf4j-simple:2.0.12")
implementation("io.javalin:javalin:6.7.0")
implementation("com.google.code.gson:gson:2.10.1")
implementation("com.github.ben-manes.caffeine:caffeine:3.2.2")
implementation("org.jsoup:jsoup:1.21.1")
implementation("it.uniroma1.dis.wsngroup.gexf4j:gexf4j:1.0.0")
implementation("de.bommels05:DBLib:1.0-SNAPSHOT")
}
// Entry point used by the application plugin.
application {
mainClass = 'de.bommels05.befatorweb.BefatorWeb'
}
// Same entry point written into the manifest of the shadow JAR.
shadowJar {
manifest {
attributes 'Main-Class': 'de.bommels05.befatorweb.BefatorWeb'
}
}
test {
useJUnitPlatform()
}

BIN
gradle/wrapper/gradle-wrapper.jar vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,6 @@
#Tue Feb 25 20:26:04 CET 2025
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-8.11-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

234
gradlew vendored Normal file
View File

@ -0,0 +1,234 @@
#!/bin/sh
#
# Copyright © 2015-2021 the original authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
##############################################################################
#
# Gradle start up script for POSIX generated by Gradle.
#
# Important for running:
#
# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is
# noncompliant, but you have some other compliant shell such as ksh or
# bash, then to run this script, type that shell name before the whole
# command line, like:
#
# ksh Gradle
#
# Busybox and similar reduced shells will NOT work, because this script
# requires all of these POSIX shell features:
# * functions;
# * expansions «$var», «${var}», «${var:-default}», «${var+SET}»,
# «${var#prefix}», «${var%suffix}», and «$( cmd )»;
# * compound commands having a testable exit status, especially «case»;
# * various built-in commands including «command», «set», and «ulimit».
#
# Important for patching:
#
# (2) This script targets any POSIX shell, so it avoids extensions provided
# by Bash, Ksh, etc; in particular arrays are avoided.
#
# The "traditional" practice of packing multiple parameters into a
# space-separated string is a well documented source of bugs and security
# problems, so this is (mostly) avoided, by progressively accumulating
# options in "$@", and eventually passing that to Java.
#
# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS,
# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly;
# see the in-line comments for details.
#
# There are tweaks for specific operating systems such as AIX, CygWin,
# Darwin, MinGW, and NonStop.
#
# (3) This script is generated from the Groovy template
# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt
# within the Gradle project.
#
# You can find Gradle at https://github.com/gradle/gradle/.
#
##############################################################################
# Attempt to set APP_HOME
# Resolve links: $0 may be a link
app_path=$0
# Need this for daisy-chained symlinks.
while
APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path
[ -h "$app_path" ]
do
ls=$( ls -ld "$app_path" )
link=${ls#*' -> '}
case $link in #(
/*) app_path=$link ;; #(
*) app_path=$APP_HOME$link ;;
esac
done
APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit
APP_NAME="Gradle"
APP_BASE_NAME=${0##*/}
# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD=maximum
# Print all arguments as one message on standard error; does not abort the script.
warn () {
echo "$*"
} >&2
# Print all arguments as one message on standard error, framed by blank lines,
# then terminate the script with exit status 1.
die () {
echo
echo "$*"
echo
exit 1
} >&2
# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "$( uname )" in #(
CYGWIN* ) cygwin=true ;; #(
Darwin* ) darwin=true ;; #(
MSYS* | MINGW* ) msys=true ;; #(
NONSTOP* ) nonstop=true ;;
esac
CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
# IBM's JDK on AIX uses strange locations for the executables
JAVACMD=$JAVA_HOME/jre/sh/java
else
JAVACMD=$JAVA_HOME/bin/java
fi
if [ ! -x "$JAVACMD" ] ; then
die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
else
JAVACMD=java
which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi
# Increase the maximum file descriptors if we can.
if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then
case $MAX_FD in #(
max*)
MAX_FD=$( ulimit -H -n ) ||
warn "Could not query maximum file descriptor limit"
esac
case $MAX_FD in #(
'' | soft) :;; #(
*)
ulimit -n "$MAX_FD" ||
warn "Could not set maximum file descriptor limit to $MAX_FD"
esac
fi
# Collect all arguments for the java command, stacking in reverse order:
# * args from the command line
# * the main class name
# * -classpath
# * -D...appname settings
# * --module-path (only if needed)
# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables.
# For Cygwin or MSYS, switch paths to Windows format before running java
if "$cygwin" || "$msys" ; then
APP_HOME=$( cygpath --path --mixed "$APP_HOME" )
CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" )
JAVACMD=$( cygpath --unix "$JAVACMD" )
# Now convert the arguments - kludge to limit ourselves to /bin/sh
for arg do
if
case $arg in #(
-*) false ;; # don't mess with options #(
/?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath
[ -e "$t" ] ;; #(
*) false ;;
esac
then
arg=$( cygpath --path --ignore --mixed "$arg" )
fi
# Roll the args list around exactly as many times as the number of
# args, so each arg winds up back in the position where it started, but
# possibly modified.
#
# NB: a `for` loop captures its iteration list before it begins, so
# changing the positional parameters here affects neither the number of
# iterations, nor the values presented in `arg`.
shift # remove old arg
set -- "$@" "$arg" # push replacement arg
done
fi
# Collect all arguments for the java command;
# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of
# shell script including quotes and variable substitutions, so put them in
# double quotes to make sure that they get re-expanded; and
# * put everything else in single quotes, so that it's not re-expanded.
set -- \
"-Dorg.gradle.appname=$APP_BASE_NAME" \
-classpath "$CLASSPATH" \
org.gradle.wrapper.GradleWrapperMain \
"$@"
# Use "xargs" to parse quoted args.
#
# With -n1 it outputs one arg per line, with the quotes and backslashes removed.
#
# In Bash we could simply go:
#
# readarray ARGS < <( xargs -n1 <<<"$var" ) &&
# set -- "${ARGS[@]}" "$@"
#
# but POSIX shell has neither arrays nor process substitution, so instead we
# post-process each arg (as a line of input to sed) to backslash-escape any
# character that might be a shell metacharacter, then use eval to reverse
# that process (while maintaining the separation between arguments), and wrap
# the whole thing up as a single "set" statement.
#
# This will of course break if any of these variables contains a newline or
# an unmatched quote.
#
eval "set -- $(
printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" |
xargs -n1 |
sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' |
tr '\n' ' '
)" '"$@"'
exec "$JAVACMD" "$@"

89
gradlew.bat vendored Normal file
View File

@ -0,0 +1,89 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem
@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem Gradle startup script for Windows
@rem
@rem ##########################################################################
@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal
set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%
@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi
@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"
@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome
set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute
echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe
if exist "%JAVA_EXE%" goto execute
echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.
goto fail
:execute
@rem Setup the command line
set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar
@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*
:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd
:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1
:mainEnd
if "%OS%"=="Windows_NT" endlocal
:omega

2
settings.gradle Normal file
View File

@ -0,0 +1,2 @@
rootProject.name = 'BefatorWeb'

View File

@ -0,0 +1,220 @@
package de.bommels05.befatorweb;
import de.bommels05.befatorweb.links.LinkDatabase;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.nio.charset.StandardCharsets;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Rewrites content downloaded from the Wayback Machine so that its links point at the local
 * {@code /proxy/} endpoint instead of {@code web.archive.org}.
 *
 * <p>HTML is parsed with jsoup and the link-carrying attributes of the known tags are rewritten;
 * CSS is rewritten with regular expressions. For HTML, CSS and JS a final textual pass rewrites
 * any remaining double-quoted {@code "https://web.archive.org/web/..."} literals.
 */
public class BefatorRewriter {

    private static final Logger LOGGER = LoggerFactory.getLogger(BefatorRewriter.class);

    private static final String ARCHIVE_STATIC_HOST = "https://web-static.archive.org";
    private static final String ARCHIVE_HOST = "https://web.archive.org";
    private static final String ARCHIVE_PATH = "/web/";
    private static final String ARCHIVE_PREFIX = ARCHIVE_HOST + ARCHIVE_PATH;
    private static final String PROXY_PREFIX = "/proxy/";
    private static final String HTTP = "http://";
    private static final String HTTPS = "https://";

    private static final Pattern CSS_URL = Pattern.compile("url\\([\"']?(.*?)[\"']?\\)");
    private static final Pattern CSS_IMPORT = Pattern.compile("@import \"(.*?)\"");
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    // NOTE(review): stored but not used anywhere in this class.
    private final Consumer<String> pageRequester;

    /**
     * @param pageRequester callback kept for later use; this class never invokes it
     */
    public BefatorRewriter(Consumer<String> pageRequester) {
        this.pageRequester = pageRequester;
    }

    /**
     * Rewrites all archive links found in the given content.
     *
     * @param input       raw content; decoded as UTF-8 for HTML, CSS and JS
     * @param url         protocol-less URL of the page the content belongs to
     * @param contentType type of the content; anything other than HTML, CSS or JS is returned as is
     * @return the rewritten content, or {@code input} itself when nothing had to be changed
     */
    public byte[] rewrite(byte[] input, String url, SavedPaged.ContentType contentType) {
        boolean html = contentType == SavedPaged.ContentType.HTML;
        boolean css = contentType == SavedPaged.ContentType.CSS;
        boolean js = contentType == SavedPaged.ContentType.JS;
        if (!html && !css && !js) {
            // Binary content must never take a round trip through a String.
            return input;
        }

        String original = new String(input, StandardCharsets.UTF_8);
        String rewritten = original;
        if (html) {
            rewritten = rewriteHtml(original, url);
        } else if (css) {
            rewritten = rewriteCss(original);
        }
        rewritten = rewriteQuotedArchiveLinks(rewritten);

        // Keep the original bytes when nothing changed so that non-UTF-8 input is not damaged.
        return rewritten.equals(original) ? input : rewritten.getBytes(StandardCharsets.UTF_8);
    }

    /** Parses the markup and rewrites every link-carrying attribute of the supported tags. */
    private static String rewriteHtml(String markup, String url) {
        LOGGER.info("Starting Rewrite...");
        Document html = Jsoup.parse(markup);

        boolean foundIcon = false;
        for (Element link : html.getElementsByTag("link")) {
            Attribute href = link.attribute("href");
            Attribute rel = link.attribute("rel");
            if (href == null || rel == null) {
                continue;
            }
            String newLink = rewriteLink(href.getValue());
            if (hasRelToken(rel.getValue(), "stylesheet")) {
                LOGGER.info("Rewriting Stylesheet Link: {} -> {}", href.getValue(), newLink);
                href.setValue(newLink);
            } else if (hasRelToken(rel.getValue(), "icon")) {
                foundIcon = true;
                LOGGER.info("Rewriting Favicon Link: {} -> {}", href.getValue(), newLink);
                href.setValue(newLink);
            }
        }

        for (Element meta : html.getElementsByTag("meta")) {
            Attribute content = meta.attribute("content");
            Attribute type = meta.attribute("http-equiv");
            if (content == null || type == null || !type.getValue().equalsIgnoreCase("refresh")) {
                continue;
            }
            // The target follows the first '=' ("5; url=..."); splice the new link in by
            // position instead of using the old link as a regular expression.
            String value = content.getValue();
            int linkStart = value.indexOf("=") + 1;
            String link = value.substring(linkStart);
            String newLink = rewriteLink(link);
            LOGGER.info("Rewriting <meta> refresh Link: {} -> {}", link, newLink);
            content.setValue(value.substring(0, linkStart) + newLink);
        }

        rewriteAttribute(html, "base", "href", link -> { });
        rewriteAttribute(html, "script", "src", link -> { });
        rewriteAttribute(html, "img", "src", link -> { });
        // Every outgoing <a> link is additionally recorded in the link database.
        rewriteAttribute(html, "a", "href", link -> LinkDatabase.addLink(url, link));
        rewriteAttribute(html, "iframe", "src", link -> { });

        for (Element style : html.getElementsByTag("style")) {
            style.html(rewriteCss(style.html()));
        }

        if (!foundIcon) {
            String host = url.contains("/") ? url.substring(0, url.indexOf("/")) : url;
            html.head().append("<link rel=\"icon\" href=\"/proxy/" + host + "/favicon.ico\">");
        }
        return html.toString();
    }

    /** Rewrites one attribute on every element with the given tag and reports each new link. */
    private static void rewriteAttribute(Document html, String tag, String attributeName,
                                         Consumer<String> afterRewrite) {
        for (Element element : html.getElementsByTag(tag)) {
            Attribute attribute = element.attribute(attributeName);
            if (attribute == null) {
                continue;
            }
            String newLink = rewriteLink(attribute.getValue());
            LOGGER.info("Rewriting <{}> Link: {} -> {}", tag, attribute.getValue(), newLink);
            attribute.setValue(newLink);
            afterRewrite.accept(newLink);
        }
    }

    /** Returns whether the whitespace-separated {@code rel} value contains the token (ignoring case). */
    private static boolean hasRelToken(String rel, String token) {
        for (String candidate : WHITESPACE.split(rel)) {
            if (candidate.equalsIgnoreCase(token)) {
                return true;
            }
        }
        return false;
    }

    /** Rewrites the targets of {@code url(...)} and {@code @import "..."} in a stylesheet. */
    private static String rewriteCss(String css) {
        String withUrls = CSS_URL.matcher(css).replaceAll(match -> {
            String newLink = rewriteLink(match.group(1));
            LOGGER.info("Rewriting CSS url() Link: {} -> {}", match.group(1), newLink);
            return Matcher.quoteReplacement("url(" + newLink + ")");
        });
        return CSS_IMPORT.matcher(withUrls).replaceAll(match -> {
            String newLink = rewriteLink(match.group(1));
            LOGGER.info("Rewriting CSS @import Link: {} -> {}", match.group(1), newLink);
            return Matcher.quoteReplacement("@import \"" + newLink + "\"");
        });
    }

    /**
     * Rewrites every double-quoted literal that starts with the archive prefix. The surrounding
     * quotes are kept, and an unterminated literal ends the scan instead of throwing.
     */
    private static String rewriteQuotedArchiveLinks(String text) {
        String marker = "\"" + ARCHIVE_PREFIX;
        int start = text.indexOf(marker);
        if (start < 0) {
            return text;
        }
        StringBuilder out = new StringBuilder(text.length());
        int copiedUpTo = 0;
        while (start >= 0) {
            int end = text.indexOf('"', start + 1);
            if (end < 0) {
                break;
            }
            String link = text.substring(start + 1, end);
            String newLink = rewriteLink(link);
            LOGGER.info("Rewriting undetected Link: {} -> {}", link, newLink);
            // Copy everything up to and including the opening quote, then the new link; the
            // closing quote is copied together with the next chunk.
            out.append(text, copiedUpTo, start + 1).append(newLink);
            copiedUpTo = end;
            start = text.indexOf(marker, end + 1);
        }
        return out.append(text, copiedUpTo, text.length()).toString();
    }

    /**
     * Converts an archive link, or an absolute http(s) link, into a local {@code /proxy/} path.
     *
     * <ul>
     *   <li>{@code https://web-static.archive.org...} is returned unchanged.</li>
     *   <li>{@code https://web.archive.org/web/<timestamp>/<target>} and
     *       {@code /web/<timestamp>/<target>} become {@code /proxy/<target without protocol>}.</li>
     *   <li>{@code http://x} and {@code https://x} become {@code /proxy/x}.</li>
     *   <li>Everything else (relative links, other schemes) is returned unchanged.</li>
     * </ul>
     *
     * @param input the link to rewrite, must not be {@code null}
     * @return the rewritten link
     */
    public static String rewriteLink(String input) {
        if (input.startsWith(ARCHIVE_STATIC_HOST)) {
            return input;
        }
        if (input.startsWith(ARCHIVE_HOST)) {
            // Strip only the leading host; a nested archive URL further inside stays intact.
            return rewriteLink(input.substring(ARCHIVE_HOST.length()));
        }
        if (input.startsWith(ARCHIVE_PATH)) {
            String timestampAndTarget = input.substring(ARCHIVE_PATH.length());
            String target = timestampAndTarget.substring(timestampAndTarget.indexOf("/") + 1);
            String newLink = rewriteLink(target);
            return newLink.startsWith(PROXY_PREFIX) ? newLink : PROXY_PREFIX + newLink;
        }
        if (input.startsWith(HTTP)) {
            return PROXY_PREFIX + input.substring(HTTP.length());
        }
        if (input.startsWith(HTTPS)) {
            return PROXY_PREFIX + input.substring(HTTPS.length());
        }
        return input;
    }

    /**
     * Tells whether two links refer to the same page. Two archive links are compared by their
     * archived target only, ignoring the capture timestamp and the target's protocol; any other
     * pair is compared verbatim.
     */
    public static boolean sameTargets(String link1, String link2) {
        if (link1.startsWith(ARCHIVE_PREFIX) && link2.startsWith(ARCHIVE_PREFIX)) {
            return archivedTarget(link1).equals(archivedTarget(link2));
        }
        return link1.equals(link2);
    }

    /** Extracts the protocol-less target from a link that starts with the archive prefix. */
    private static String archivedTarget(String archiveLink) {
        String timestampAndTarget = archiveLink.substring(ARCHIVE_PREFIX.length());
        return stripProtocol(timestampAndTarget.substring(timestampAndTarget.indexOf("/") + 1));
    }

    /** Removes a leading {@code https://} or {@code http://}; other input is returned unchanged. */
    public static String stripProtocol(String url) {
        if (url.startsWith(HTTPS)) {
            return url.substring(HTTPS.length());
        }
        if (url.startsWith(HTTP)) {
            return url.substring(HTTP.length());
        }
        return url;
    }
}

View File

@ -0,0 +1,188 @@
package de.bommels05.befatorweb;
import com.github.benmanes.caffeine.cache.Caffeine;
import com.github.benmanes.caffeine.cache.LoadingCache;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import de.bommels05.befatorweb.links.LinkDatabase;
import io.javalin.Javalin;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
/**
 * HTTP front end: serves pages of the archived web through {@code /proxy/*}, downloading and
 * caching the capture closest to the configured target date on first access.
 */
public class BefatorWeb {

    /** Year whose captures are preferred. */
    public static final int TARGET_YEAR = 2004;
    /** Preferred month as a zero-based {@link Calendar} month constant. */
    public static final int TARGET_MONTH = Calendar.OCTOBER;
    /** Delay in milliseconds; not applied anywhere in this class at the moment. */
    public static final int REQUEST_DELAY = 5000;
    /** Default target page for the progress message. */
    public static final String TARGET_URL = "www.microsoft.com/";

    private static final Logger LOGGER = LoggerFactory.getLogger(BefatorWeb.class);
    private static final String CALENDAR_API = "https://web.archive.org/__wb/calendarcaptures/2?url=";
    /** Matches URLs whose first path segment contains no dot, i.e. that lack a host name. */
    private static final Pattern HOSTLESS_URL = Pattern.compile("^[^\\/.]+(?:\\/|$)");
    private static final Gson GSON = new Gson();
    private static final LoadingCache<String, String> urlCache = Caffeine.newBuilder().build(BefatorWeb::getClosestArchiveUrl);
    private static final ReentrantLock requestLock = new ReentrantLock();

    public static void main(String[] args) {
        Javalin app = Javalin.create().start(3636);

        app.get("/", ctx -> ctx.result("Befator Inc grüßt Sie!"));

        app.get("/api/progressMessage", ctx -> {
            String requestedTarget = ctx.queryParam("targetPage");
            String target = requestedTarget == null ? TARGET_URL : requestedTarget;
            String url = ctx.queryParam("currentPage");
            if (url == null) {
                ctx.result("Dein aktuelles Ziel ist: " + target);
                return;
            }
            int distance = LinkDatabase.getDistance(url, target);
            if (distance == -1) {
                ctx.result("Es ist (noch) kein sicherer Weg zu deinem Ziel (" + target + ") bekannt");
            } else {
                ctx.result("Du bist " + distance + " Links von deinem Ziel (" + target + ") entfernt");
            }
        });

        app.get("/proxy/*", ctx -> {
            String url = BefatorRewriter.stripProtocol(ctx.path().replaceFirst("/proxy/", ""));
            url = url + (ctx.queryString() != null ? "?" + ctx.queryString() : "");
            if (HOSTLESS_URL.matcher(url).find()) {
                LOGGER.warn("Tried to request invalid Page {}", url);
                ctx.status(404);
                return;
            }
            LOGGER.info("Requesting Page {}", url);
            if (!ensureCached(url)) {
                ctx.result("Page not found!");
                ctx.status(404);
                return;
            }
            SavedPaged page = SiteCache.getPage(url);
            if (page.status() == SavedPaged.StatusCode.REDIRECT) {
                ctx.redirect(BefatorRewriter.rewriteLink(new String(page.getContent(), StandardCharsets.UTF_8)));
                return;
            }
            ctx.result(page.getContent());
            ctx.res().setCharacterEncoding(StandardCharsets.UTF_8.name());
            ctx.contentType(page.type().toString());
        });
    }

    /**
     * Makes sure the page is present in the {@link SiteCache}, downloading it when necessary.
     * Downloads are serialised through {@code requestLock}.
     *
     * @return {@code false} when no archived capture of the page could be found
     */
    private static boolean ensureCached(String url) {
        if (SiteCache.containsPage(url)) {
            return true;
        }
        // Acquire the lock outside the try block so that finally never unlocks a lock we do not hold.
        requestLock.lock();
        try {
            // A concurrent request may have cached the page while this one waited for the lock;
            // adding it a second time would make SiteCache.addPage throw.
            if (SiteCache.containsPage(url)) {
                return true;
            }
            String archiveUrl = urlCache.get(url);
            if (archiveUrl == null) {
                return false;
            }
            PageDownloadData content = downloadPage(archiveUrl);
            SavedPaged.ContentType contentType = content.contentType();
            LOGGER.info("Downloaded page {} from {} with type {}", url, archiveUrl, contentType);
            SiteCache.addPage(url, content.content(), contentType, content.status());
            return true;
        } finally {
            requestLock.unlock();
        }
    }

    /**
     * Downloads an archive URL. When the connection ended up at a different target than requested,
     * the final location is returned as a REDIRECT entry instead of the page body.
     */
    private static PageDownloadData downloadPage(String url) {
        try {
            URLConnection connection = new URL(url).openConnection();
            try (var body = connection.getInputStream()) {
                // After the stream is open, getURL() reflects the location that was finally served.
                String finalLocation = connection.getURL().toString();
                if (!BefatorRewriter.sameTargets(finalLocation, url)) {
                    LinkDatabase.addLink(BefatorRewriter.rewriteLink(url), BefatorRewriter.rewriteLink(finalLocation));
                    return new PageDownloadData(finalLocation.getBytes(StandardCharsets.UTF_8), SavedPaged.ContentType.REDIRECT, SavedPaged.StatusCode.REDIRECT);
                }
                return new PageDownloadData(body.readAllBytes(), SavedPaged.ContentType.fromString(connection.getContentType()), SavedPaged.StatusCode.OK);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Fetches the URL and parses the UTF-8 response body as a JSON object. */
    private static JsonObject getJsonContent(String url) {
        try {
            URLConnection connection = new URL(url).openConnection();
            try (var body = connection.getInputStream()) {
                return GSON.fromJson(new String(body.readAllBytes(), StandardCharsets.UTF_8), JsonObject.class);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static String toTwoDigitString(int i) {
        return i < 10 ? "0" + i : String.valueOf(i);
    }

    /**
     * Looks up the capture closest to the middle of the target month, falling back to the start
     * of the following year when the target year has no captures.
     *
     * @return the archive URL, or {@code null} when no capture was found
     */
    private static String getClosestArchiveUrl(String url) {
        String archiveUrl = getClosestArchiveUrl(url, TARGET_YEAR, TARGET_MONTH);
        if (archiveUrl == null) {
            // The month parameter is a zero-based Calendar month: JANUARY, not the literal 1 (February).
            return getClosestArchiveUrl(url, TARGET_YEAR + 1, Calendar.JANUARY);
        }
        return archiveUrl;
    }

    /**
     * Queries the capture calendar of the given year and returns the archive URL of the first
     * capture on the day closest to the 15th of {@code targetMonth}.
     *
     * @param targetMonth zero-based {@link Calendar} month
     * @return the archive URL, or {@code null} when the year has no captures
     */
    private static String getClosestArchiveUrl(String url, int targetYear, int targetMonth) {
        String encoded = URLEncoder.encode(url, StandardCharsets.UTF_8);
        JsonObject yearInfo = getJsonContent(CALENDAR_API + encoded + "&date=" + targetYear + "&groupby=day");
        if (yearInfo == null || !yearInfo.has("items")) {
            return null;
        }
        JsonArray days = yearInfo.get("items").getAsJsonArray();
        if (days.isEmpty()) {
            return null;
        }

        Calendar calendar = Calendar.getInstance();
        calendar.setTimeInMillis(0);
        calendar.set(targetYear, targetMonth, 15);
        long target = calendar.getTimeInMillis();

        long closest = 0;
        long closestDistance = Long.MAX_VALUE;
        for (JsonElement day : days) {
            // Each entry starts with the capture day encoded as month * 100 + dayOfMonth.
            int monthAndDay = day.getAsJsonArray().get(0).getAsInt();
            calendar.set(targetYear, monthAndDay / 100 - 1, monthAndDay % 100);
            long captured = calendar.getTimeInMillis();
            long distance = Math.abs(captured - target);
            // On a tie the later day wins.
            if (distance < closestDistance || (distance == closestDistance && captured > closest)) {
                closest = captured;
                closestDistance = distance;
            }
        }
        calendar.setTimeInMillis(closest);

        String date = targetYear + toTwoDigitString(calendar.get(Calendar.MONTH) + 1) + toTwoDigitString(calendar.get(Calendar.DAY_OF_MONTH));
        JsonObject dayInfo = getJsonContent(CALENDAR_API + encoded + "&date=" + date);
        if (dayInfo == null || !dayInfo.has("items") || dayInfo.get("items").getAsJsonArray().isEmpty()) {
            return null;
        }
        String timeOfDay = dayInfo.get("items").getAsJsonArray().get(0).getAsJsonArray().get(0).getAsString();
        // The time arrives without leading zeros; pad it to the six digits of HHmmss.
        timeOfDay = "0".repeat(Math.max(0, 6 - timeOfDay.length())) + timeOfDay;
        return "https://web.archive.org/web/" + date + timeOfDay + "/" + url;
    }

    /** Result of a download: the body (or redirect target), its type and its status. */
    private record PageDownloadData(byte[] content, SavedPaged.ContentType contentType, SavedPaged.StatusCode status) {}
}

View File

@ -0,0 +1,79 @@
package de.bommels05.befatorweb;
/**
 * Index entry of a cached page. The content itself is not held in memory; it is read from the
 * {@link SiteCache} on demand.
 *
 * @param url    URL under which the page is cached
 * @param type   content type of the page
 * @param status whether the entry is a regular page or a redirect
 */
public record SavedPaged(String url, ContentType type, StatusCode status) {

    /** Reads the cached content of this page from the {@link SiteCache}. */
    public byte[] getContent() {
        return SiteCache.getPageContent(url);
    }

    /** Content types the proxy distinguishes, each with its canonical media type and aliases. */
    public enum ContentType {
        HTML("text/html"),
        CSS("text/css"),
        JS("application/x-javascript", "text/javascript", "application/javascript"),
        GIF("image/gif"),
        JPEG("image/jpeg"),
        PNG("image/png"),
        ICO("image/x-icon", "image/vnd.microsoft.icon"),
        BINARY("application/octet-stream"),
        PLAIN("text/plain"),
        REDIRECT("befator/redirect");

        private final String value;
        private final String[] aliases;

        ContentType(String value, String... aliases) {
            this.value = value;
            this.aliases = aliases;
        }

        /**
         * Maps a {@code Content-Type} header value to a constant. Parameters such as
         * {@code ; charset=...} are ignored and the comparison is case-insensitive.
         *
         * @param contentType header value, may be {@code null}
         * @return the matching constant; {@link #PLAIN} for {@code null} and {@link #BINARY} for
         *         media types that are not listed, so that such pages can still be served
         */
        public static ContentType fromString(String contentType) {
            if (contentType == null) {
                return PLAIN;
            }
            String header = contentType.strip();
            for (ContentType type : values()) {
                if (type.matches(header)) {
                    return type;
                }
            }
            return BINARY;
        }

        /** Returns whether the header starts with the canonical value or one of the aliases. */
        private boolean matches(String header) {
            if (startsWithIgnoreCase(header, value)) {
                return true;
            }
            for (String alias : aliases) {
                if (startsWithIgnoreCase(header, alias)) {
                    return true;
                }
            }
            return false;
        }

        private static boolean startsWithIgnoreCase(String text, String prefix) {
            return text.regionMatches(true, 0, prefix, 0, prefix.length());
        }

        /** Returns the canonical media type, suitable for a {@code Content-Type} header. */
        @Override
        public String toString() {
            return value;
        }
    }

    /** HTTP status codes the cache distinguishes. */
    public enum StatusCode {
        OK(200),
        REDIRECT(301);

        private final int value;

        StatusCode(int value) {
            this.value = value;
        }

        /**
         * @throws IllegalArgumentException if the status is not one of the known codes
         */
        public static StatusCode fromInt(int status) {
            for (StatusCode statusCode : values()) {
                if (status == statusCode.value) {
                    return statusCode;
                }
            }
            throw new IllegalArgumentException("Invalid status code: " + status);
        }

        public int toInt() {
            return value;
        }
    }
}

View File

@ -0,0 +1,110 @@
package de.bommels05.befatorweb;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
/**
 * Disk-backed page cache. Page contents live in {@code cache/<n>}, where {@code n} is the
 * position of the page in the index; the index itself is persisted as {@code cache_index.json}.
 *
 * <p>All public methods synchronise on the class because the index is shared between request
 * threads.
 */
public class SiteCache {

    private static final Logger LOGGER = LoggerFactory.getLogger(SiteCache.class);
    private static final String CACHE_DIRECTORY = "cache";
    private static final String INDEX_FILE = "cache_index.json";
    private static final Gson GSON = new Gson();
    private static final List<SavedPaged> pages = new ArrayList<>();
    private static final BefatorRewriter rewriter = new BefatorRewriter(url -> {});

    static {
        try {
            File cache = new File(CACHE_DIRECTORY);
            if (!cache.exists()) {
                Files.createDirectory(cache.toPath());
            }
            if (new File(INDEX_FILE).exists()) {
                loadIndex();
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Returns the index entry for the URL, or {@code null} when the page is not cached. */
    public static synchronized SavedPaged getPage(String url) {
        int index = indexOf(url);
        return index < 0 ? null : pages.get(index);
    }

    public static synchronized boolean containsPage(String url) {
        return indexOf(url) >= 0;
    }

    /**
     * Reads the cached content of a page.
     *
     * @throws IllegalArgumentException if the page is not cached
     */
    public static synchronized byte[] getPageContent(String url) {
        int index = indexOf(url);
        if (index < 0) {
            // Do not fall through to reading "cache/-1": that would be treated as a corrupt
            // cache and wipe the in-memory index.
            throw new IllegalArgumentException("Page " + url + " is not cached");
        }
        return readFile(CACHE_DIRECTORY + "/" + index);
    }

    /**
     * Rewrites the content, stores it on disk and registers the page in the index.
     *
     * @throws IllegalArgumentException if the page is already cached
     */
    public static synchronized void addPage(String url, byte[] content, SavedPaged.ContentType contentType, SavedPaged.StatusCode status) {
        if (containsPage(url)) {
            throw new IllegalArgumentException("Page " + url + " is already cached");
        }
        // Rewrite and write the file first: if either step fails, the index must not contain an
        // entry without a backing file.
        byte[] rewritten = rewriter.rewrite(content, url, contentType);
        int index = pages.size();
        overwriteFile(CACHE_DIRECTORY + "/" + index, rewritten);
        pages.add(new SavedPaged(url, contentType, status));
        saveIndex();
    }

    /** Returns the position of the page with the given URL in the index, or -1. */
    private static int indexOf(String url) {
        for (int i = 0; i < pages.size(); i++) {
            if (pages.get(i).url().equals(url)) {
                return i;
            }
        }
        return -1;
    }

    private static byte[] readFile(String fileName) {
        try (FileInputStream stream = new FileInputStream(fileName)) {
            return stream.readAllBytes();
        } catch (FileNotFoundException e) {
            // An indexed file is missing: the index no longer matches the disk, so drop it.
            LOGGER.error("Cache seems to be invalid - Resetting", e);
            pages.clear();
            throw new RuntimeException(e);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Writes the content to the file, replacing any previous content. */
    private static void overwriteFile(String fileName, byte[] content) {
        try {
            Files.write(new File(fileName).toPath(), content);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    private static void saveIndex() {
        JsonArray array = new JsonArray();
        for (SavedPaged page : pages) {
            array.add(GSON.toJsonTree(page));
        }
        JsonObject index = new JsonObject();
        index.add("pages", array);
        overwriteFile(INDEX_FILE, GSON.toJson(index).getBytes(StandardCharsets.UTF_8));
    }

    private static void loadIndex() {
        JsonObject index = GSON.fromJson(new String(readFile(INDEX_FILE), StandardCharsets.UTF_8), JsonObject.class);
        if (index == null || !index.has("pages")) {
            // Empty or foreign index file: start with an empty cache.
            return;
        }
        for (JsonElement page : index.get("pages").getAsJsonArray()) {
            pages.add(GSON.fromJson(page, SavedPaged.class));
        }
    }
}

View File

@ -0,0 +1,68 @@
package de.bommels05.befatorweb.links;
import it.uniroma1.dis.wsngroup.gexf4j.core.Gexf;
import it.uniroma1.dis.wsngroup.gexf4j.core.Graph;
import it.uniroma1.dis.wsngroup.gexf4j.core.Mode;
import it.uniroma1.dis.wsngroup.gexf4j.core.Node;
import it.uniroma1.dis.wsngroup.gexf4j.core.impl.GexfImpl;
import it.uniroma1.dis.wsngroup.gexf4j.core.impl.StaxGraphWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Command-line tool that exports the link database as a GEXF graph ({@code links.gexf}): one
 * node per page, sized by its number of incoming links, and one edge per link.
 */
public class GexfExporter {

    public static void main(String[] args) throws IOException {
        Gexf gexf = new GexfImpl();
        gexf.setVisualization(true);
        Graph graph = gexf.getGraph();
        graph.setMode(Mode.DYNAMIC);

        // Query the table once instead of once per page.
        var links = LinkDatabase.LINKS_TABLE.getAll();

        Map<String, Integer> incomingLinks = new HashMap<>();
        for (LinkDBEntry link : links) {
            incomingLinks.merge(link.getDestination(), 1, Integer::sum);
        }

        // Nodes are created in first-seen order (source before destination); the node id is the
        // position of the page in that order.
        Map<String, Node> nodes = new HashMap<>();
        for (LinkDBEntry link : links) {
            addNode(graph, nodes, incomingLinks, link.getSource());
            addNode(graph, nodes, incomingLinks, link.getDestination());
        }

        for (Map.Entry<String, Node> entry : nodes.entrySet()) {
            Node node = entry.getValue();
            LinkDatabase.getLinks(entry.getKey()).forEach(destination -> {
                Node target = nodes.get(destination);
                if (target != null) {
                    node.connectTo(target);
                }
            });
        }

        StaxGraphWriter graphWriter = new StaxGraphWriter();
        try (FileOutputStream out = new FileOutputStream(new File("links.gexf"))) {
            graphWriter.writeToStream(gexf, out, "UTF-8");
        }
    }

    /** Creates the node for the page unless it already exists. */
    private static void addNode(Graph graph, Map<String, Node> nodes, Map<String, Integer> incomingLinks, String page) {
        if (nodes.containsKey(page)) {
            return;
        }
        int incoming = incomingLinks.getOrDefault(page, 0);
        Node node = graph.createNode(String.valueOf(nodes.size()));
        node.setLabel(page);
        node.setSize(incoming);
        nodes.put(page, node);
    }
}

View File

@ -0,0 +1,34 @@
package de.bommels05.befatorweb.links;
import de.bommels05.dblib.core.DBEntry;
import de.bommels05.dblib.core.DBEntryField;
import de.bommels05.dblib.core.QueryHolder;
import de.bommels05.dblib.core.Table;
/**
 * One stored link: a directed edge from a {@code source} page to a {@code destination} page.
 *
 * <p>Both constructors are {@code protected}; within the visible code, instances are created
 * by {@code LinkDatabase} in the same package.
 */
public class LinkDBEntry extends DBEntry<LinkDBEntry, Integer> {
// Page the link starts from; annotated with the field name "source".
@DBEntryField(name = "source")
private String source;
// Page the link points to; annotated with the field name "destination".
@DBEntryField(name = "destination")
private String destination;
/**
 * Creates an entry backed by a query result.
 *
 * @param table the table this entry belongs to
 * @param queryHolder the query result the entry is created from
 */
protected LinkDBEntry(Table<LinkDBEntry, Integer> table, QueryHolder queryHolder) {
super(table, queryHolder);
// NOTE(review): finalize(Object) and fill() are inherited from DBEntry (DBLib); presumably
// they register the annotated fields and then load them from the query result - confirm.
finalize(this);
fill();
}
/**
 * Creates a new entry from explicit values, without a query result.
 *
 * @param table the table this entry belongs to
 * @param source the page the link starts from
 * @param destination the page the link points to
 */
protected LinkDBEntry(Table<LinkDBEntry, Integer> table, String source, String destination) {
super(table, null);
// The fields are assigned before finalize(this) is called; keep this order.
this.source = source;
this.destination = destination;
finalize(this);
}
/** @return the page the link starts from */
public String getSource() {
return source;
}
/** @return the page the link points to */
public String getDestination() {
return destination;
}
}

View File

@ -0,0 +1,133 @@
package de.bommels05.befatorweb.links;
import de.bommels05.befatorweb.BefatorRewriter;
import de.bommels05.befatorweb.links.calc.BefatorDistanceCalculator;
import de.bommels05.befatorweb.links.calc.DistanceCalculator;
import de.bommels05.befatorweb.links.calc.V2DistanceCalculator;
import de.bommels05.dblib.core.DBEntry;
import de.bommels05.dblib.core.Database;
import de.bommels05.dblib.core.QueryHolder;
import de.bommels05.dblib.core.Table;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.Stream;
/**
 * Static facade over the {@code links} database: records links between pages and answers
 * distance queries over the resulting directed graph.
 *
 * <p>Pages are stored without the {@code /proxy/} prefix and without a leading
 * {@code https://}.
 */
public class LinkDatabase {
    private static final Logger LOGGER = LoggerFactory.getLogger(LinkDatabase.class);
    /** Path prefix under which proxied pages are addressed. */
    private static final String PROXY_PREFIX = "/proxy/";
    private static final String HTTPS_PREFIX = "https://";
    /** Registered path-finding strategies; only the first one is used by {@link #getDistance}. */
    private static final List<DistanceCalculator> calculators = new ArrayList<>();
    public static final Database LINKS_DB = new Database("links");

    static {
        // The table has to be registered before LINKS_TABLE is looked up below.
        LINKS_DB.registerTable("links", "source String", "destination String");
        calculators.add(new V2DistanceCalculator());
    }

    public static final Table<LinkDBEntry, Integer> LINKS_TABLE = LINKS_DB.getTable("links", LinkDBEntry::new);

    /**
     * Stores a link from {@code source} to {@code destination} unless it is a pure fragment
     * link, a self link or already stored.
     *
     * <p>A leading {@code /proxy/} is removed from the source and replaced by
     * {@code https://} on the destination; the destination is then resolved against the
     * source and a leading {@code https://} is removed from the result. Syntactically
     * invalid URLs are logged and ignored.
     *
     * @param source the page containing the link
     * @param destination the link target as found on the page (absolute or relative)
     */
    public static void addLink(String source, String destination) {
        if (destination.startsWith("#")) {
            return;
        }
        if (source.startsWith(PROXY_PREFIX)) {
            source = source.substring(PROXY_PREFIX.length());
        }
        if (destination.startsWith(PROXY_PREFIX)) {
            destination = HTTPS_PREFIX + destination.substring(PROXY_PREFIX.length());
        }
        try {
            // NOTE(review): the base is used exactly as passed in. If it carries no scheme,
            // a root-relative destination such as "/x" presumably resolves without the host -
            // confirm that callers never pass such links.
            String finalDestination = new URI(source).resolve(new URI(destination)).toString();
            if (finalDestination.startsWith(HTTPS_PREFIX)) {
                finalDestination = finalDestination.substring(HTTPS_PREFIX.length());
            }
            if (source.equals(finalDestination) || hasDirectLink(source, finalDestination)) {
                return;
            }
            new LinkDBEntry(LINKS_TABLE, source, finalDestination).save();
        } catch (URISyntaxException e) {
            LOGGER.error("Invalid Link URL", e);
        }
    }

    /** Returns whether a link from {@code source} to {@code destination} is already stored. */
    private static boolean hasDirectLink(String source, String destination) {
        return LINKS_TABLE.getAll().stream()
                .anyMatch(link -> link.getSource().equals(source) && link.getDestination().equals(destination));
    }

    /**
     * Computes the number of links on the path found from {@code source} to
     * {@code destination} by the first registered calculator, and logs the path together
     * with the time the search took.
     *
     * @param source the start page; a leading {@code /proxy/} and the protocol are stripped
     * @param destination the target page, used as given
     * @return the number of links on the path, or {@code -1} if no path was found
     * @throws IllegalStateException if no distance calculator is registered
     */
    public static int getDistance(String source, String destination) {
        if (source.startsWith(PROXY_PREFIX)) {
            source = source.substring(PROXY_PREFIX.length());
        }
        source = BefatorRewriter.stripProtocol(source);
        if (calculators.isEmpty()) {
            throw new IllegalStateException("No distance calculator registered");
        }
        DistanceCalculator calculator = calculators.get(0);
        long start = System.currentTimeMillis();
        List<String> path = calculator.getPath(source, destination, LinkDatabase::getLinks);
        String description = path == null ? "null" : String.join(" <- ", path);
        LOGGER.info("Path took {}ms ({})", System.currentTimeMillis() - start, description);
        return path == null ? -1 : path.size() - 1;
    }

    /**
     * Returns the destinations of all stored links that start at {@code source}.
     *
     * @param source the page whose outgoing links are requested
     * @return a stream over the destinations, backed by an already materialized list
     * @throws RuntimeException wrapping the {@link SQLException} if the query fails
     */
    public static Stream<String> getLinks(String source) {
        try {
            QueryHolder result = LINKS_DB.executeQuery("SELECT * FROM links WHERE source = ?", source);
            List<String> links = new ArrayList<>();
            try {
                while (result.getResultSet().next()) {
                    links.add(new LinkDBEntry(LINKS_TABLE, result).getDestination());
                }
            } finally {
                // Release the query even when reading a row fails.
                result.close();
            }
            return links.stream();
        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }
}

View File

@ -0,0 +1,60 @@
package de.bommels05.befatorweb.links.calc;
import de.bommels05.befatorweb.links.LinkDBEntry;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Stream;
/**
 * Recursive, depth-first {@link DistanceCalculator}: follows every outgoing link of a page
 * and keeps the shortest path found among the branches. Pages already seen on the way are
 * blacklisted per branch so that cycles terminate.
 */
public class BefatorDistanceCalculator implements DistanceCalculator {

    /**
     * @return the number of links on the path found by {@link #getPath}, or {@code -1} if
     *         there is none
     */
    @Override
    public int getDistance(String source, String destination, Function<String, Stream<String>> linkGetter) {
        List<String> path = getPath(source, destination, linkGetter);
        return path == null ? -1 : path.size() - 1;
    }

    /**
     * Searches a path from {@code source} to {@code destination}.
     *
     * @param linkGetter returns the outgoing links of a page
     * @return the pages on the path, starting with {@code destination} and ending with
     *         {@code source}, or {@code null} if no path was found
     */
    @Override
    public List<String> getPath(String source, String destination, Function<String, Stream<String>> linkGetter) {
        List<String> hops = getPath(source, destination, new HashSet<>(), linkGetter);
        if (hops == null) {
            return null;
        }
        // The recursive search may hand back an immutable List.of(...), so the result is
        // copied into a fresh list instead of being modified in place.
        List<String> path = new ArrayList<>(hops.size() + 1);
        path.add(destination);
        path.addAll(hops);
        return path;
    }

    /**
     * Recursive step of the search.
     *
     * @param blacklist pages that must not be followed from here; extended with the
     *                  outgoing links of {@code source} before descending
     * @return the pages leading to {@code destination} (excluding it), nearest to the
     *         destination first and {@code source} last; empty if {@code source} already is
     *         the destination; {@code null} if no path was found
     */
    private static List<String> getPath(String source, String destination, Set<String> blacklist, Function<String, Stream<String>> linkGetter) {
        if (source.equals(destination)) {
            return List.of();
        }
        List<String> outgoing = linkGetter.apply(source).filter(link -> !blacklist.contains(link)).toList();
        if (outgoing.isEmpty()) {
            return null;
        }
        if (outgoing.contains(destination)) {
            return List.of(source);
        }
        blacklist.addAll(outgoing);
        // Every branch works on its own copy so that siblings do not block each other further down.
        Optional<List<String>> shortest = outgoing.stream()
                .map(link -> getPath(link, destination, new HashSet<>(blacklist), linkGetter))
                .filter(Objects::nonNull)
                .min(Comparator.comparingInt(List::size));
        if (shortest.isEmpty()) {
            return null;
        }
        List<String> path = new ArrayList<>(shortest.get());
        path.add(source);
        return path;
    }

    @Override
    public String getName() {
        return "Befator";
    }
}

View File

@ -0,0 +1,15 @@
package de.bommels05.befatorweb.links.calc;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Stream;
/**
 * Strategy for finding a path between two pages in a link graph whose outgoing links are
 * supplied by a lookup function.
 */
public interface DistanceCalculator {

    /**
     * Calculates the distance between two pages.
     *
     * @param source the page to start from
     * @param destination the page to reach
     * @param linkGetter returns the outgoing links of a page
     * @return the distance as defined by the implementation
     */
    int getDistance(String source, String destination, Function<String, Stream<String>> linkGetter);

    /**
     * Searches a path between two pages.
     *
     * @param source the page to start from
     * @param destination the page to reach
     * @param linkGetter returns the outgoing links of a page
     * @return the pages on the path as defined by the implementation
     */
    List<String> getPath(String source, String destination, Function<String, Stream<String>> linkGetter);

    /**
     * @return a short name identifying the implementation
     */
    String getName();
}

View File

@ -0,0 +1,72 @@
package de.bommels05.befatorweb.links.calc;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Stream;
/**
 * Breadth-first {@link DistanceCalculator}. Pages are discovered in order of increasing
 * distance from the source, so the first time the destination is discovered the path to it
 * is a shortest one and the search stops.
 */
public class V2DistanceCalculator implements DistanceCalculator {

    /**
     * @return the number of links on the path found by {@link #getPath}, or {@code -1} if
     *         the destination is not reachable
     */
    @Override
    public int getDistance(String source, String destination, Function<String, Stream<String>> linkGetter) {
        List<String> path = getPath(source, destination, linkGetter);
        return path == null ? -1 : path.size() - 1;
    }

    /**
     * Searches a shortest path from {@code source} to {@code destination}.
     *
     * @param linkGetter returns the outgoing links of a page
     * @return the pages on the path, starting with {@code destination} and ending with
     *         {@code source} (a single element if both are equal), or {@code null} if the
     *         destination is not reachable
     */
    @Override
    public List<String> getPath(String source, String destination, Function<String, Stream<String>> linkGetter) {
        // Maps every discovered page to the page it was discovered from; the source maps to null.
        Map<String, String> sources = new HashMap<>();
        sources.put(source, null);

        if (!source.equals(destination)) {
            Deque<String> queue = new ArrayDeque<>();
            queue.add(source);
            boolean found = false;
            while (!found && !queue.isEmpty()) {
                String target = queue.poll();
                for (String link : linkGetter.apply(target).toList()) {
                    if (sources.containsKey(link)) {
                        continue;
                    }
                    sources.put(link, target);
                    if (link.equals(destination)) {
                        // Leave the whole search, not just this page's link list.
                        found = true;
                        break;
                    }
                    queue.add(link);
                }
            }
            if (!found) {
                return null;
            }
        }

        // Walk the predecessor chain back from the destination to the source.
        List<String> path = new ArrayList<>();
        for (String current = destination; current != null; current = sources.get(current)) {
            path.add(current);
        }
        return path;
    }

    @Override
    public String getName() {
        return "V2";
    }
}

View File

@ -0,0 +1,3 @@
Manifest-Version: 1.0
Main-Class: de.bommels05.befatorweb.BefatorWeb