Fix indexing, upgrade MeiliSearch SDK (#100)

* Hotfixes for indexing

* Handles a missing INDEX_CACHE_PATH environment variable
* Exits on startup if required environment variables are missing. The
  --allow-missing-vars flag disables this check, but that is generally a
  bad idea, since most environment variables are required (and the ones
  that aren't should be marked as such).
* Disables the actix request loggers

* Upgrade meilisearch-sdk to 0.4.0 for MeiliSearch 0.16 support

* Fix swapped Forge and Fabric loader labels in the CurseForge indexer
Aeledfyr, 2020-11-05 09:38:03 -06:00 (committed by GitHub)
parent d477874535
commit c8e58a1e5b
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 105 additions and 86 deletions

Cargo.lock (generated)

@@ -1726,15 +1726,14 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 
 [[package]]
 name = "meilisearch-sdk"
-version = "0.3.0"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a9e61da1ebd3d15e0aaa978d3f1f080e3793494ddea0bc6703da4b330ea1ffc"
+checksum = "cb2081610089deb10290747b8782049f9cb64a70a4d305a28970db8b780d1448"
 dependencies = [
  "log",
  "reqwest",
  "serde",
  "serde_json",
- "urlencoding",
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "web-sys",
@@ -3009,12 +3008,6 @@ dependencies = [
  "percent-encoding",
 ]
 
-[[package]]
-name = "urlencoding"
-version = "1.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c9232eb53352b4442e40d7900465dfc534e8cb2dc8f18656fcb2ac16112b5593"
-
 [[package]]
 name = "v_escape"
 version = "0.13.2"

Cargo.toml

@@ -18,7 +18,7 @@ actix-files = "0.4.0"
 actix-multipart = "0.3.0"
 actix-cors = "0.4.1"
-meilisearch-sdk = "0.3.0"
+meilisearch-sdk = "0.4.0"
 reqwest = { version = "0.10.8", features = ["json"] }
 serde_json = "1.0"
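
The version bump above is what drives the API change in the search route at the end of this diff: with meilisearch-sdk 0.4, a query is built from an index handle and executed there, rather than constructed standalone and passed to the index. A rough sketch of that flow under stated assumptions: `Client::new(address, key)` and the `Document` trait (`type UIDType` / `get_uid`) are assumed to match earlier SDK versions, and `ModDoc` / `first_page` are hypothetical names used for illustration only.

```rust
use meilisearch_sdk::{client::Client, document::Document};
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug)]
struct ModDoc {
    mod_id: String,
    title: String,
}

// Assumption: the 0.4 SDK still identifies documents through this trait,
// as earlier versions did.
impl Document for ModDoc {
    type UIDType = String;
    fn get_uid(&self) -> &Self::UIDType {
        &self.mod_id
    }
}

async fn first_page(address: &str, key: &str) -> Result<Vec<ModDoc>, Box<dyn std::error::Error>> {
    // Assumption: Client::new takes the MeiliSearch address and key directly;
    // Box<dyn Error> is only for brevity in this sketch.
    let client = Client::new(address, key);
    let index = client.get_index("relevance_mods").await?;

    // 0.4 style: build the query from the index, then execute it,
    // instead of passing a standalone Query to the index's search method.
    let mut query = index.search();
    query.with_query("shaders").with_limit(10).with_offset(0);

    let results = query.execute::<ModDoc>().await?;
    // Each hit wraps the deserialized document in a `result` field.
    Ok(results.hits.into_iter().map(|hit| hit.result).collect())
}
```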


@@ -1,11 +1,10 @@
 use crate::file_hosting::S3Host;
 use actix_cors::Cors;
 use actix_ratelimit::{MemoryStore, MemoryStoreActor, RateLimiter};
-use actix_web::middleware::Logger;
 use actix_web::{http, web, App, HttpServer};
 use env_logger::Env;
 use gumdrop::Options;
-use log::{info, warn};
+use log::{error, info, warn};
 use search::indexing::index_mods;
 use search::indexing::IndexingSettings;
 use std::sync::Arc;
@@ -29,6 +28,12 @@ struct Config {
     reconfigure_indices: bool,
     #[options(no_short, help = "Reset the documents in the indices")]
     reset_indices: bool,
+    #[options(
+        no_short,
+        help = "Allow missing environment variables on startup. This is a bad idea, but it may work in some cases."
+    )]
+    allow_missing_vars: bool,
 }
 
 #[actix_rt::main]
@@ -38,7 +43,15 @@ async fn main() -> std::io::Result<()> {
     let config = Config::parse_args_default_or_exit();
 
-    check_env_vars();
+    if check_env_vars() {
+        error!("Some environment variables are missing!");
+        if !config.allow_missing_vars {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "Missing required environment variables",
+            ));
+        }
+    }
 
     let search_config = search::SearchConfig {
         address: dotenv::var("MEILISEARCH_ADDR").unwrap(),
@@ -225,8 +238,6 @@ async fn main() -> std::io::Result<()> {
         App::new()
             .wrap(cors.finish())
-            .wrap(Logger::default())
-            .wrap(Logger::new("%a %{User-Agent}i"))
             .wrap(
                 RateLimiter::new(MemoryStoreActor::from(store.clone()).start())
                     .with_interval(std::time::Duration::from_secs(60))
@@ -254,8 +265,10 @@ async fn main() -> std::io::Result<()> {
 }
 
 // This is so that env vars not used immediately don't panic at runtime
-fn check_env_vars() {
-    fn check_var<T: std::str::FromStr>(var: &str) {
+fn check_env_vars() -> bool {
+    let mut failed = false;
+
+    fn check_var<T: std::str::FromStr>(var: &str) -> bool {
         if dotenv::var(var)
             .ok()
             .and_then(|s| s.parse::<T>().ok())
@@ -265,7 +278,10 @@ fn check_env_vars() {
                 "Variable `{}` missing in dotenv or not of type `{}`",
                 var,
                 std::any::type_name::<T>()
-            )
+            );
+            true
+        } else {
+            false
         }
     }
@@ -275,51 +291,55 @@ fn check_env_vars() {
         .is_none()
     {
         warn!("Variable `CORS_ORIGINS` missing in dotenv or not a json array of strings");
+        failed |= true;
     }
 
-    check_var::<String>("CDN_URL");
-    check_var::<String>("DATABASE_URL");
-    check_var::<String>("MEILISEARCH_ADDR");
-    check_var::<String>("MEILISEARCH_KEY");
-    check_var::<String>("BIND_ADDR");
-    check_var::<String>("STORAGE_BACKEND");
+    failed |= check_var::<String>("CDN_URL");
+    failed |= check_var::<String>("DATABASE_URL");
+    failed |= check_var::<String>("MEILISEARCH_ADDR");
+    failed |= check_var::<String>("MEILISEARCH_KEY");
+    failed |= check_var::<String>("BIND_ADDR");
+    failed |= check_var::<String>("STORAGE_BACKEND");
 
     let storage_backend = dotenv::var("STORAGE_BACKEND").ok();
 
     if storage_backend.as_deref() == Some("backblaze") {
-        check_var::<String>("BACKBLAZE_KEY_ID");
-        check_var::<String>("BACKBLAZE_KEY");
-        check_var::<String>("BACKBLAZE_BUCKET_ID");
+        failed |= check_var::<String>("BACKBLAZE_KEY_ID");
+        failed |= check_var::<String>("BACKBLAZE_KEY");
+        failed |= check_var::<String>("BACKBLAZE_BUCKET_ID");
     } else if storage_backend.as_deref() == Some("s3") {
-        check_var::<String>("S3_ACCESS_TOKEN");
-        check_var::<String>("S3_SECRET");
-        check_var::<String>("S3_URL");
-        check_var::<String>("S3_REGION");
-        check_var::<String>("S3_BUCKET_NAME");
+        failed |= check_var::<String>("S3_ACCESS_TOKEN");
+        failed |= check_var::<String>("S3_SECRET");
+        failed |= check_var::<String>("S3_URL");
+        failed |= check_var::<String>("S3_REGION");
+        failed |= check_var::<String>("S3_BUCKET_NAME");
     } else if storage_backend.as_deref() == Some("local") {
-        check_var::<String>("MOCK_FILE_PATH");
+        failed |= check_var::<String>("MOCK_FILE_PATH");
     } else if let Some(backend) = storage_backend {
         warn!("Variable `STORAGE_BACKEND` contains an invalid value: {}. Expected \"backblaze\", \"s3\", or \"local\".", backend);
+        failed |= true;
     }
 
-    check_var::<bool>("INDEX_CURSEFORGE");
+    failed |= check_var::<bool>("INDEX_CURSEFORGE");
 
     if dotenv::var("INDEX_CURSEFORGE")
         .ok()
         .and_then(|s| s.parse::<bool>().ok())
        .unwrap_or(false)
     {
-        check_var::<usize>("EXTERNAL_INDEX_INTERVAL");
-        check_var::<usize>("MAX_CURSEFORGE_ID");
+        failed |= check_var::<usize>("EXTERNAL_INDEX_INTERVAL");
+        failed |= check_var::<usize>("MAX_CURSEFORGE_ID");
     }
 
-    check_var::<usize>("LOCAL_INDEX_INTERVAL");
+    failed |= check_var::<usize>("LOCAL_INDEX_INTERVAL");
 
     // In theory this should be an OsString since it's a path, but
     // dotenv doesn't support that. The usage of this does treat
     // it as an OsString, though.
-    check_var::<String>("INDEX_CACHE_PATH");
+    failed |= check_var::<String>("INDEX_CACHE_PATH");
 
-    check_var::<String>("GITHUB_CLIENT_ID");
-    check_var::<String>("GITHUB_CLIENT_SECRET");
+    failed |= check_var::<String>("GITHUB_CLIENT_ID");
+    failed |= check_var::<String>("GITHUB_CLIENT_SECRET");
+
+    failed
 }
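
The change from bare `check_var(...)` calls to `failed |= check_var(...)` is deliberate: unlike a short-circuiting `||`, the `|=` operator always evaluates its right-hand side, so every missing variable is reported in one run before the accumulated flag decides whether startup aborts. A minimal standalone sketch of the pattern, using `std::env::var` instead of dotenv and hypothetical helper names:

```rust
// Hypothetical stand-in for check_var: warn and report whether the variable is missing.
fn report_missing(var: &str) -> bool {
    let missing = std::env::var(var).is_err();
    if missing {
        eprintln!("Variable `{}` is missing", var);
    }
    missing
}

fn any_missing() -> bool {
    let mut failed = false;
    // `failed |= ...` evaluates every check even after one has already failed,
    // so all problems show up in a single run; `failed = failed || ...` would
    // stop warning after the first missing variable.
    failed |= report_missing("MEILISEARCH_ADDR");
    failed |= report_missing("MEILISEARCH_KEY");
    failed |= report_missing("DATABASE_URL");
    failed
}

fn main() {
    if any_missing() {
        eprintln!("Some environment variables are missing!");
        std::process::exit(1);
    }
}
```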


@@ -119,21 +119,23 @@ lazy_static::lazy_static! {
 pub async fn index_curseforge(
     start_index: u32,
     end_index: u32,
-    cache_path: &std::path::Path,
+    cache_path: Option<&std::path::Path>,
 ) -> Result<Vec<UploadSearchMod>, IndexingError> {
     info!("Indexing curseforge mods!");
 
     let start = std::time::Instant::now();
     let mut docs_to_add: Vec<UploadSearchMod> = vec![];
 
-    let cache = std::fs::File::open(cache_path)
+    let cache = cache_path
+        .map(std::fs::File::open)
+        .and_then(Result::ok)
         .map(std::io::BufReader::new)
         .map(serde_json::from_reader::<_, Vec<u32>>);
 
     let requested_ids;
 
     // This caching system can't handle segmented indexing
-    if let Ok(Ok(mut cache)) = cache {
+    if let Some(Ok(mut cache)) = cache {
         let end = cache.last().copied().unwrap_or(start_index);
         cache.extend(end..end_index);
         requested_ids = serde_json::to_string(&cache)?;
@@ -167,11 +169,13 @@ pub async fn index_curseforge(
     // Only write to the cache if this doesn't skip mods at the start
     // The caching system iterates through all ids normally past the last
     // id in the cache, so the end_index shouldn't matter.
-    if start_index <= 1 {
-        let mut ids = curseforge_mods.iter().map(|m| m.id).collect::<Vec<_>>();
-        ids.sort_unstable();
-        if let Err(e) = std::fs::write(cache_path, serde_json::to_string(&ids)?) {
-            log::warn!("Error writing to index id cache: {}", e);
+    if let Some(path) = cache_path {
+        if start_index <= 1 {
+            let mut ids = curseforge_mods.iter().map(|m| m.id).collect::<Vec<_>>();
+            ids.sort_unstable();
+            if let Err(e) = std::fs::write(path, serde_json::to_string(&ids)?) {
+                log::warn!("Error writing to index id cache: {}", e);
+            }
         }
     }
@@ -192,8 +196,8 @@ pub async fn index_curseforge(
         for file in curseforge_mod.latest_files {
             for version in file.game_version {
                 match &*version {
-                    "Fabric" => loaders.forge = true,
-                    "Forge" => loaders.fabric = true,
+                    "Fabric" => loaders.fabric = true,
+                    "Forge" => loaders.forge = true,
                     "Rift" => loaders.rift = true,
                     _ => (),
                 }
@@ -309,7 +313,6 @@ pub async fn index_curseforge(
             modified_timestamp: curseforge_mod.date_modified.timestamp(),
             latest_version,
             host: Cow::Borrowed("curseforge"),
-            empty: Cow::Borrowed("{}{}{}"),
         })
     }
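
With `cache_path` now an `Option<&Path>`, a missing INDEX_CACHE_PATH simply means no cache rather than a panic, and open failures are folded away with `Result::ok`. A small sketch of just that chain; parse errors are folded away too for brevity here, and `read_id_cache` is a hypothetical helper, not part of the real indexer:

```rust
use std::io::BufReader;
use std::path::Path;

// Returns the cached curseforge ids, or None if there is no usable cache:
// no path configured, the file can't be opened, or it isn't valid JSON.
fn read_id_cache(cache_path: Option<&Path>) -> Option<Vec<u32>> {
    cache_path
        .map(std::fs::File::open)                    // Option<io::Result<File>>
        .and_then(Result::ok)                        // drop open errors -> Option<File>
        .map(BufReader::new)
        .map(serde_json::from_reader::<_, Vec<u32>>) // Option<serde_json::Result<Vec<u32>>>
        .and_then(Result::ok)                        // drop parse errors -> Option<Vec<u32>>
}

fn main() {
    match read_id_cache(Some(Path::new("index-cache.json"))) {
        Some(ids) => println!("resuming from {} cached ids", ids.len()),
        None => println!("no usable cache, indexing every id from scratch"),
    }
}
```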


@@ -112,7 +112,6 @@ pub async fn index_local(pool: PgPool) -> Result<Vec<UploadSearchMod>, IndexingError>
             modified_timestamp: mod_data.updated.timestamp(),
             latest_version,
             host: Cow::Borrowed("modrinth"),
-            empty: Cow::Borrowed("{}{}{}"),
         });
     }
 }
@@ -225,6 +224,5 @@ pub async fn query_one(
             modified_timestamp: mod_data.updated.timestamp(),
             latest_version,
             host: Cow::Borrowed("modrinth"),
-            empty: Cow::Borrowed("{}{}{}"),
         })
     }


@@ -63,7 +63,7 @@ pub async fn index_mods(
 ) -> Result<(), IndexingError> {
     let mut docs_to_add: Vec<UploadSearchMod> = vec![];
 
-    let cache_path = std::path::PathBuf::from(std::env::var_os("INDEX_CACHE_PATH").unwrap());
+    let cache_path = std::env::var_os("INDEX_CACHE_PATH").map(std::path::PathBuf::from);
 
     if settings.index_local {
         docs_to_add.append(&mut index_local(pool.clone()).await?);
@@ -74,7 +74,7 @@ pub async fn index_mods(
             .map(|i| i.parse().unwrap())
             .unwrap_or(450_000);
 
-        docs_to_add.append(&mut index_curseforge(1, end_index, &cache_path).await?);
+        docs_to_add.append(&mut index_curseforge(1, end_index, cache_path.as_deref()).await?);
     }
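
In the hunk above, the cache path is read with `var_os` (so a non-UTF-8 path survives), kept as an `Option<PathBuf>`, and lent to `index_curseforge` as `Option<&Path>` via `as_deref()`. A tiny sketch of that handoff, with `index_with_optional_cache` standing in for the real indexer:

```rust
use std::path::{Path, PathBuf};

// Hypothetical stand-in for index_curseforge's cache handling.
fn index_with_optional_cache(cache_path: Option<&Path>) {
    match cache_path {
        Some(path) => println!("using id cache at {}", path.display()),
        None => println!("INDEX_CACHE_PATH not set; indexing without a cache"),
    }
}

fn main() {
    // var_os keeps the raw OsString, so a non-UTF-8 path is preserved instead of
    // erroring as std::env::var would.
    let cache_path: Option<PathBuf> = std::env::var_os("INDEX_CACHE_PATH").map(PathBuf::from);

    // as_deref() borrows the owned PathBuf as &Path without moving it,
    // matching the indexer's Option<&std::path::Path> parameter.
    index_with_optional_cache(cache_path.as_deref());
}
```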
// Write Indices // Write Indices
@@ -270,7 +270,6 @@ fn default_settings() -> Settings {
         "categories".to_string(),
         "versions".to_string(),
         "author".to_string(),
-        "empty".to_string(),
     ];
 
     Settings::new()


@@ -5,7 +5,6 @@ use actix_web::web::HttpResponse;
 use chrono::{DateTime, Utc};
 use meilisearch_sdk::client::Client;
 use meilisearch_sdk::document::Document;
-use meilisearch_sdk::search::Query;
 use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::cmp::min;
@@ -84,12 +83,6 @@ pub struct UploadSearchMod {
     pub modified_timestamp: i64,
 
     pub host: Cow<'static, str>,
-
-    /// Must be "{}{}{}", a hack until meilisearch supports searches
-    /// with empty queries (https://github.com/meilisearch/MeiliSearch/issues/729)
-    // This is a Cow to prevent unnecessary allocations for a static
-    // string
-    pub empty: Cow<'static, str>,
 }
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -155,23 +148,6 @@ pub async fn search_for_mod(
     let offset = info.offset.as_deref().unwrap_or("0").parse()?;
     let index = info.index.as_deref().unwrap_or("relevance");
     let limit = info.limit.as_deref().unwrap_or("10").parse()?;
-
-    let search_query: &str = info
-        .query
-        .as_deref()
-        .filter(|s| !s.is_empty())
-        .unwrap_or("{}{}{}");
-
-    let mut query = Query::new(search_query)
-        .with_limit(min(100, limit))
-        .with_offset(offset);
-
-    if !filters.is_empty() {
-        query = query.with_filters(&filters);
-    }
-
-    if let Some(facets) = &info.facets {
-        let facets = serde_json::from_str::<Vec<Vec<&str>>>(facets)?;
-        query = query.with_facet_filters(facets);
-    }
 
     let index = match index {
         "relevance" => "relevance_mods",
@@ -181,14 +157,44 @@
         i => return Err(SearchError::InvalidIndex(i.to_string())),
     };
 
-    let results = client
-        .get_index(index)
-        .await?
-        .search::<ResultSearchMod>(&query)
-        .await?;
+    let meilisearch_index = client.get_index(index).await?;
+    let mut query = meilisearch_index.search();
+
+    query.with_limit(min(100, limit)).with_offset(offset);
+
+    if let Some(search) = info.query.as_deref() {
+        if !search.is_empty() {
+            query.with_query(search);
+        }
+    }
+
+    if !filters.is_empty() {
+        query.with_filters(&filters);
+    }
+
+    // So the meilisearch sdk's lifetimes are... broken, to say the least
+    // They are overspecified and almost always wrong, and would generally
+    // just be better if they didn't specify them at all.
+    // They also decided to have this take a &[&[&str]], which is impossible
+    // to construct efficiently. Instead it should take impl Iterator<Item=&[&str]>,
+    // &[impl AsRef<[&str]>], or one of many other proper solutions to that issue.
+    let why_meilisearch;
+    let why_must_you_do_this;
+    if let Some(facets) = &info.facets {
+        why_meilisearch = serde_json::from_str::<Vec<Vec<&str>>>(facets)?;
+        why_must_you_do_this = why_meilisearch
+            .iter()
+            .map(|v| v as &[_])
+            .collect::<Vec<&[_]>>();
+        query.with_facet_filters(&why_must_you_do_this);
+    }
+
+    let results = query.execute::<ResultSearchMod>().await?;
 
     Ok(SearchResults {
-        hits: results.hits,
+        hits: results.hits.into_iter().map(|r| r.result).collect(),
         offset: results.offset,
         limit: results.limit,
         total_hits: results.nb_hits,