Fix indexing, upgrade MeiliSearch sdk (#100)
* Hotfixes for indexing
* Handles missing INDEX_CACHE_PATH environment variable
* Exits on startup if environment variables are missing. The flag --allow-missing-vars disables this, but that is generally a bad idea, since most environment variables are required (and the ones that aren't should be marked as such).
* Disables the query loggers
* Upgrade meilisearch-sdk to 0.4.0 for MeiliSearch 0.16 support
* Fix swap of Forge and Fabric labeling
This commit is contained in:
parent
d477874535
commit
c8e58a1e5b
11
Cargo.lock
generated
11
Cargo.lock
generated
@ -1726,15 +1726,14 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-sdk"
|
||||
version = "0.3.0"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3a9e61da1ebd3d15e0aaa978d3f1f080e3793494ddea0bc6703da4b330ea1ffc"
|
||||
checksum = "cb2081610089deb10290747b8782049f9cb64a70a4d305a28970db8b780d1448"
|
||||
dependencies = [
|
||||
"log",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"urlencoding",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
@ -3009,12 +3008,6 @@ dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "urlencoding"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9232eb53352b4442e40d7900465dfc534e8cb2dc8f18656fcb2ac16112b5593"
|
||||
|
||||
[[package]]
|
||||
name = "v_escape"
|
||||
version = "0.13.2"
|
||||
|
||||
@ -18,7 +18,7 @@ actix-files = "0.4.0"
|
||||
actix-multipart = "0.3.0"
|
||||
actix-cors = "0.4.1"
|
||||
|
||||
meilisearch-sdk = "0.3.0"
|
||||
meilisearch-sdk = "0.4.0"
|
||||
reqwest = { version = "0.10.8", features = ["json"] }
|
||||
|
||||
serde_json = "1.0"
|
||||
|
||||
80
src/main.rs
80
src/main.rs
@ -1,11 +1,10 @@
|
||||
use crate::file_hosting::S3Host;
|
||||
use actix_cors::Cors;
|
||||
use actix_ratelimit::{MemoryStore, MemoryStoreActor, RateLimiter};
|
||||
use actix_web::middleware::Logger;
|
||||
use actix_web::{http, web, App, HttpServer};
|
||||
use env_logger::Env;
|
||||
use gumdrop::Options;
|
||||
use log::{info, warn};
|
||||
use log::{error, info, warn};
|
||||
use search::indexing::index_mods;
|
||||
use search::indexing::IndexingSettings;
|
||||
use std::sync::Arc;
|
||||
@ -29,6 +28,12 @@ struct Config {
|
||||
reconfigure_indices: bool,
|
||||
#[options(no_short, help = "Reset the documents in the indices")]
|
||||
reset_indices: bool,
|
||||
|
||||
#[options(
|
||||
no_short,
|
||||
help = "Allow missing environment variables on startup. This is a bad idea, but it may work in some cases."
|
||||
)]
|
||||
allow_missing_vars: bool,
|
||||
}
|
||||
|
||||
#[actix_rt::main]
|
||||
@ -38,7 +43,15 @@ async fn main() -> std::io::Result<()> {
|
||||
|
||||
let config = Config::parse_args_default_or_exit();
|
||||
|
||||
check_env_vars();
|
||||
if check_env_vars() {
|
||||
error!("Some environment variables are missing!");
|
||||
if !config.allow_missing_vars {
|
||||
return Err(std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
"Missing required environment variables",
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let search_config = search::SearchConfig {
|
||||
address: dotenv::var("MEILISEARCH_ADDR").unwrap(),
|
||||
@ -225,8 +238,6 @@ async fn main() -> std::io::Result<()> {
|
||||
|
||||
App::new()
|
||||
.wrap(cors.finish())
|
||||
.wrap(Logger::default())
|
||||
.wrap(Logger::new("%a %{User-Agent}i"))
|
||||
.wrap(
|
||||
RateLimiter::new(MemoryStoreActor::from(store.clone()).start())
|
||||
.with_interval(std::time::Duration::from_secs(60))
|
||||
@ -254,8 +265,10 @@ async fn main() -> std::io::Result<()> {
|
||||
}
|
||||
|
||||
// This is so that env vars not used immediately don't panic at runtime
|
||||
fn check_env_vars() {
|
||||
fn check_var<T: std::str::FromStr>(var: &str) {
|
||||
fn check_env_vars() -> bool {
|
||||
let mut failed = false;
|
||||
|
||||
fn check_var<T: std::str::FromStr>(var: &str) -> bool {
|
||||
if dotenv::var(var)
|
||||
.ok()
|
||||
.and_then(|s| s.parse::<T>().ok())
|
||||
@ -265,7 +278,10 @@ fn check_env_vars() {
|
||||
"Variable `{}` missing in dotenv or not of type `{}`",
|
||||
var,
|
||||
std::any::type_name::<T>()
|
||||
)
|
||||
);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
@ -275,51 +291,55 @@ fn check_env_vars() {
|
||||
.is_none()
|
||||
{
|
||||
warn!("Variable `CORS_ORIGINS` missing in dotenv or not a json array of strings");
|
||||
failed |= true;
|
||||
}
|
||||
|
||||
check_var::<String>("CDN_URL");
|
||||
check_var::<String>("DATABASE_URL");
|
||||
check_var::<String>("MEILISEARCH_ADDR");
|
||||
check_var::<String>("MEILISEARCH_KEY");
|
||||
check_var::<String>("BIND_ADDR");
|
||||
failed |= check_var::<String>("CDN_URL");
|
||||
failed |= check_var::<String>("DATABASE_URL");
|
||||
failed |= check_var::<String>("MEILISEARCH_ADDR");
|
||||
failed |= check_var::<String>("MEILISEARCH_KEY");
|
||||
failed |= check_var::<String>("BIND_ADDR");
|
||||
|
||||
check_var::<String>("STORAGE_BACKEND");
|
||||
failed |= check_var::<String>("STORAGE_BACKEND");
|
||||
|
||||
let storage_backend = dotenv::var("STORAGE_BACKEND").ok();
|
||||
|
||||
if storage_backend.as_deref() == Some("backblaze") {
|
||||
check_var::<String>("BACKBLAZE_KEY_ID");
|
||||
check_var::<String>("BACKBLAZE_KEY");
|
||||
check_var::<String>("BACKBLAZE_BUCKET_ID");
|
||||
failed |= check_var::<String>("BACKBLAZE_KEY_ID");
|
||||
failed |= check_var::<String>("BACKBLAZE_KEY");
|
||||
failed |= check_var::<String>("BACKBLAZE_BUCKET_ID");
|
||||
} else if storage_backend.as_deref() == Some("s3") {
|
||||
check_var::<String>("S3_ACCESS_TOKEN");
|
||||
check_var::<String>("S3_SECRET");
|
||||
check_var::<String>("S3_URL");
|
||||
check_var::<String>("S3_REGION");
|
||||
check_var::<String>("S3_BUCKET_NAME");
|
||||
failed |= check_var::<String>("S3_ACCESS_TOKEN");
|
||||
failed |= check_var::<String>("S3_SECRET");
|
||||
failed |= check_var::<String>("S3_URL");
|
||||
failed |= check_var::<String>("S3_REGION");
|
||||
failed |= check_var::<String>("S3_BUCKET_NAME");
|
||||
} else if storage_backend.as_deref() == Some("local") {
|
||||
check_var::<String>("MOCK_FILE_PATH");
|
||||
failed |= check_var::<String>("MOCK_FILE_PATH");
|
||||
} else if let Some(backend) = storage_backend {
|
||||
warn!("Variable `STORAGE_BACKEND` contains an invalid value: {}. Expected \"backblaze\", \"s3\", or \"local\".", backend);
|
||||
failed |= true;
|
||||
}
|
||||
|
||||
check_var::<bool>("INDEX_CURSEFORGE");
|
||||
failed |= check_var::<bool>("INDEX_CURSEFORGE");
|
||||
if dotenv::var("INDEX_CURSEFORGE")
|
||||
.ok()
|
||||
.and_then(|s| s.parse::<bool>().ok())
|
||||
.unwrap_or(false)
|
||||
{
|
||||
check_var::<usize>("EXTERNAL_INDEX_INTERVAL");
|
||||
check_var::<usize>("MAX_CURSEFORGE_ID");
|
||||
failed |= check_var::<usize>("EXTERNAL_INDEX_INTERVAL");
|
||||
failed |= check_var::<usize>("MAX_CURSEFORGE_ID");
|
||||
}
|
||||
|
||||
check_var::<usize>("LOCAL_INDEX_INTERVAL");
|
||||
failed |= check_var::<usize>("LOCAL_INDEX_INTERVAL");
|
||||
|
||||
// In theory this should be an OsString since it's a path, but
|
||||
// dotenv doesn't support that. The usage of this does treat
|
||||
// it as an OsString, though.
|
||||
check_var::<String>("INDEX_CACHE_PATH");
|
||||
failed |= check_var::<String>("INDEX_CACHE_PATH");
|
||||
|
||||
check_var::<String>("GITHUB_CLIENT_ID");
|
||||
check_var::<String>("GITHUB_CLIENT_SECRET");
|
||||
failed |= check_var::<String>("GITHUB_CLIENT_ID");
|
||||
failed |= check_var::<String>("GITHUB_CLIENT_SECRET");
|
||||
|
||||
failed
|
||||
}
|
||||
|
||||
@ -119,21 +119,23 @@ lazy_static::lazy_static! {
|
||||
pub async fn index_curseforge(
|
||||
start_index: u32,
|
||||
end_index: u32,
|
||||
cache_path: &std::path::Path,
|
||||
cache_path: Option<&std::path::Path>,
|
||||
) -> Result<Vec<UploadSearchMod>, IndexingError> {
|
||||
info!("Indexing curseforge mods!");
|
||||
let start = std::time::Instant::now();
|
||||
|
||||
let mut docs_to_add: Vec<UploadSearchMod> = vec![];
|
||||
|
||||
let cache = std::fs::File::open(cache_path)
|
||||
let cache = cache_path
|
||||
.map(std::fs::File::open)
|
||||
.and_then(Result::ok)
|
||||
.map(std::io::BufReader::new)
|
||||
.map(serde_json::from_reader::<_, Vec<u32>>);
|
||||
|
||||
let requested_ids;
|
||||
|
||||
// This caching system can't handle segmented indexing
|
||||
if let Ok(Ok(mut cache)) = cache {
|
||||
if let Some(Ok(mut cache)) = cache {
|
||||
let end = cache.last().copied().unwrap_or(start_index);
|
||||
cache.extend(end..end_index);
|
||||
requested_ids = serde_json::to_string(&cache)?;
|
||||
@ -167,13 +169,15 @@ pub async fn index_curseforge(
|
||||
// Only write to the cache if this doesn't skip mods at the start
|
||||
// The caching system iterates through all ids normally past the last
|
||||
// id in the cache, so the end_index shouldn't matter.
|
||||
if let Some(path) = cache_path {
|
||||
if start_index <= 1 {
|
||||
let mut ids = curseforge_mods.iter().map(|m| m.id).collect::<Vec<_>>();
|
||||
ids.sort_unstable();
|
||||
if let Err(e) = std::fs::write(cache_path, serde_json::to_string(&ids)?) {
|
||||
if let Err(e) = std::fs::write(path, serde_json::to_string(&ids)?) {
|
||||
log::warn!("Error writing to index id cache: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for mut curseforge_mod in curseforge_mods {
|
||||
// The gameId of minecraft is 432
|
||||
@ -192,8 +196,8 @@ pub async fn index_curseforge(
|
||||
for file in curseforge_mod.latest_files {
|
||||
for version in file.game_version {
|
||||
match &*version {
|
||||
"Fabric" => loaders.forge = true,
|
||||
"Forge" => loaders.fabric = true,
|
||||
"Fabric" => loaders.fabric = true,
|
||||
"Forge" => loaders.forge = true,
|
||||
"Rift" => loaders.rift = true,
|
||||
_ => (),
|
||||
}
|
||||
@ -309,7 +313,6 @@ pub async fn index_curseforge(
|
||||
modified_timestamp: curseforge_mod.date_modified.timestamp(),
|
||||
latest_version,
|
||||
host: Cow::Borrowed("curseforge"),
|
||||
empty: Cow::Borrowed("{}{}{}"),
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@ -112,7 +112,6 @@ pub async fn index_local(pool: PgPool) -> Result<Vec<UploadSearchMod>, IndexingE
|
||||
modified_timestamp: mod_data.updated.timestamp(),
|
||||
latest_version,
|
||||
host: Cow::Borrowed("modrinth"),
|
||||
empty: Cow::Borrowed("{}{}{}"),
|
||||
});
|
||||
}
|
||||
}
|
||||
@ -225,6 +224,5 @@ pub async fn query_one(
|
||||
modified_timestamp: mod_data.updated.timestamp(),
|
||||
latest_version,
|
||||
host: Cow::Borrowed("modrinth"),
|
||||
empty: Cow::Borrowed("{}{}{}"),
|
||||
})
|
||||
}
|
||||
|
||||
@ -63,7 +63,7 @@ pub async fn index_mods(
|
||||
) -> Result<(), IndexingError> {
|
||||
let mut docs_to_add: Vec<UploadSearchMod> = vec![];
|
||||
|
||||
let cache_path = std::path::PathBuf::from(std::env::var_os("INDEX_CACHE_PATH").unwrap());
|
||||
let cache_path = std::env::var_os("INDEX_CACHE_PATH").map(std::path::PathBuf::from);
|
||||
|
||||
if settings.index_local {
|
||||
docs_to_add.append(&mut index_local(pool.clone()).await?);
|
||||
@ -74,7 +74,7 @@ pub async fn index_mods(
|
||||
.map(|i| i.parse().unwrap())
|
||||
.unwrap_or(450_000);
|
||||
|
||||
docs_to_add.append(&mut index_curseforge(1, end_index, &cache_path).await?);
|
||||
docs_to_add.append(&mut index_curseforge(1, end_index, cache_path.as_deref()).await?);
|
||||
}
|
||||
|
||||
// Write Indices
|
||||
@ -270,7 +270,6 @@ fn default_settings() -> Settings {
|
||||
"categories".to_string(),
|
||||
"versions".to_string(),
|
||||
"author".to_string(),
|
||||
"empty".to_string(),
|
||||
];
|
||||
|
||||
Settings::new()
|
||||
|
||||
@ -5,7 +5,6 @@ use actix_web::web::HttpResponse;
|
||||
use chrono::{DateTime, Utc};
|
||||
use meilisearch_sdk::client::Client;
|
||||
use meilisearch_sdk::document::Document;
|
||||
use meilisearch_sdk::search::Query;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::borrow::Cow;
|
||||
use std::cmp::min;
|
||||
@ -84,12 +83,6 @@ pub struct UploadSearchMod {
|
||||
pub modified_timestamp: i64,
|
||||
|
||||
pub host: Cow<'static, str>,
|
||||
|
||||
/// Must be "{}{}{}", a hack until meilisearch supports searches
|
||||
/// with empty queries (https://github.com/meilisearch/MeiliSearch/issues/729)
|
||||
// This is a Cow to prevent unnecessary allocations for a static
|
||||
// string
|
||||
pub empty: Cow<'static, str>,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, Debug)]
|
||||
@ -155,23 +148,6 @@ pub async fn search_for_mod(
|
||||
let offset = info.offset.as_deref().unwrap_or("0").parse()?;
|
||||
let index = info.index.as_deref().unwrap_or("relevance");
|
||||
let limit = info.limit.as_deref().unwrap_or("10").parse()?;
|
||||
let search_query: &str = info
|
||||
.query
|
||||
.as_deref()
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or("{}{}{}");
|
||||
|
||||
let mut query = Query::new(search_query)
|
||||
.with_limit(min(100, limit))
|
||||
.with_offset(offset);
|
||||
|
||||
if !filters.is_empty() {
|
||||
query = query.with_filters(&filters);
|
||||
}
|
||||
if let Some(facets) = &info.facets {
|
||||
let facets = serde_json::from_str::<Vec<Vec<&str>>>(facets)?;
|
||||
query = query.with_facet_filters(facets);
|
||||
}
|
||||
|
||||
let index = match index {
|
||||
"relevance" => "relevance_mods",
|
||||
@ -181,14 +157,44 @@ pub async fn search_for_mod(
|
||||
i => return Err(SearchError::InvalidIndex(i.to_string())),
|
||||
};
|
||||
|
||||
let results = client
|
||||
.get_index(index)
|
||||
.await?
|
||||
.search::<ResultSearchMod>(&query)
|
||||
.await?;
|
||||
let meilisearch_index = client.get_index(index).await?;
|
||||
let mut query = meilisearch_index.search();
|
||||
|
||||
query.with_limit(min(100, limit)).with_offset(offset);
|
||||
|
||||
if let Some(search) = info.query.as_deref() {
|
||||
if !search.is_empty() {
|
||||
query.with_query(search);
|
||||
}
|
||||
}
|
||||
|
||||
if !filters.is_empty() {
|
||||
query.with_filters(&filters);
|
||||
}
|
||||
|
||||
// So the meilisearch sdk's lifetimes are... broken, to say the least
|
||||
// They are overspecified and almost always wrong, and would generally
|
||||
// just be better if they didn't specify them at all.
|
||||
|
||||
// They also decided to have this take a &[&[&str]], which is impossible
|
||||
// to construct efficiently. Instead it should take impl Iterator<Item=&[&str]>,
|
||||
// &[impl AsRef<[&str]>], or one of many other proper solutions to that issue.
|
||||
|
||||
let why_meilisearch;
|
||||
let why_must_you_do_this;
|
||||
if let Some(facets) = &info.facets {
|
||||
why_meilisearch = serde_json::from_str::<Vec<Vec<&str>>>(facets)?;
|
||||
why_must_you_do_this = why_meilisearch
|
||||
.iter()
|
||||
.map(|v| v as &[_])
|
||||
.collect::<Vec<&[_]>>();
|
||||
query.with_facet_filters(&why_must_you_do_this);
|
||||
}
|
||||
|
||||
let results = query.execute::<ResultSearchMod>().await?;
|
||||
|
||||
Ok(SearchResults {
|
||||
hits: results.hits,
|
||||
hits: results.hits.into_iter().map(|r| r.result).collect(),
|
||||
offset: results.offset,
|
||||
limit: results.limit,
|
||||
total_hits: results.nb_hits,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user