feat: add disk caching of search results and infobox data (configurable TTLs)

This commit is contained in:
frosty
2026-03-10 03:40:34 -04:00
parent a11bf8bb6c
commit e33310f263
8 changed files with 185 additions and 18 deletions

View File

@@ -10,7 +10,7 @@ else
LDFLAGS :=
endif
LIBS := -lbeaker -lcurl -lxml2 -lpthread -lm
LIBS := -lbeaker -lcurl -lxml2 -lpthread -lm -lssl -lcrypto
SRC_DIR := src
BIN_DIR := bin

View File

@@ -14,3 +14,13 @@ port = 8000
# Randomize proxy credentials for each request
#randomize_username = true
#randomize_password = true
[cache]
# Directory to store cached responses
#dir = /tmp/omnisearch_cache
# Cache TTL for search results in seconds (default: 3600 = 1 hour)
#ttl_search = 3600
# Cache TTL for infobox data in seconds (default: 86400 = 24 hours)
#ttl_infobox = 86400

View File

@@ -80,6 +80,15 @@ int load_config(const char *filename, Config *config) {
} else if (strcmp(key, "randomize_password") == 0) {
config->randomize_password = atoi(value);
}
} else if (strcmp(section, "cache") == 0) {
if (strcmp(key, "dir") == 0) {
strncpy(config->cache_dir, value, sizeof(config->cache_dir) - 1);
config->cache_dir[sizeof(config->cache_dir) - 1] = '\0';
} else if (strcmp(key, "ttl_search") == 0) {
config->cache_ttl_search = atoi(value);
} else if (strcmp(key, "ttl_infobox") == 0) {
config->cache_ttl_infobox = atoi(value);
}
}
}
}

View File

@@ -9,6 +9,9 @@ typedef struct {
int max_proxy_retries;
int randomize_username;
int randomize_password;
char cache_dir[512];
int cache_ttl_search;
int cache_ttl_infobox;
} Config;
int load_config(const char *filename, Config *config);

View File

@@ -1,4 +1,5 @@
#include "Dictionary.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Scraping/Scraping.h"
#include <ctype.h>
@@ -266,6 +267,48 @@ InfoBox fetch_dictionary_data(const char *query) {
if (!url)
return info;
char *cache_key = cache_compute_key(url, 0, "dictionary");
if (cache_key && get_cache_ttl_infobox() > 0) {
char *cached_data = NULL;
size_t cached_size = 0;
if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING);
if (doc) {
char *word = xpath_text(doc, "//span[@class='hw dhw']");
char *pron = xpath_text(
doc,
"//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
char *pos = xpath_text(doc, "//span[@class='pos dpos']");
char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");
if (word && def) {
info.title = strdup("Dictionary");
info.extract = build_html(word, pron, pos, def, ex);
info.thumbnail_url = strdup("/static/dictionary.jpg");
info.url = strdup(url);
}
free(word);
free(pron);
free(pos);
free(def);
free(ex);
xmlFreeDoc(doc);
}
free(cached_data);
free(cache_key);
free(url);
return info;
}
free(cached_data);
}
free(cache_key);
CURL *curl = curl_easy_init();
if (!curl) {
free(url);
@@ -281,6 +324,12 @@ InfoBox fetch_dictionary_data(const char *query) {
apply_proxy_settings(curl);
if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
cache_key = cache_compute_key(url, 0, "dictionary");
if (cache_key && get_cache_ttl_infobox() > 0) {
cache_set(cache_key, chunk.memory, chunk.size);
}
free(cache_key);
htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING);

View File

@@ -1,4 +1,5 @@
#include "Wikipedia.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Scraping/Scraping.h"
#include <curl/curl.h>
@@ -117,6 +118,32 @@ InfoBox fetch_wiki_data(char *api_url) {
struct WikiMemoryStruct chunk;
InfoBox info = {NULL, NULL, NULL, NULL};
if (!api_url) {
return info;
}
char *cache_key = cache_compute_key(api_url, 0, "wikipedia");
if (cache_key && get_cache_ttl_infobox() > 0) {
char *cached_data = NULL;
size_t cached_size = 0;
if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
xmlDocPtr doc =
xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0);
if (doc != NULL) {
xmlNode *root_element = xmlDocGetRootElement(doc);
extract_wiki_info(root_element, &info);
xmlFreeDoc(doc);
}
free(cached_data);
free(cache_key);
return info;
}
free(cached_data);
}
free(cache_key);
chunk.memory = malloc(1);
chunk.size = 0;
@@ -132,7 +159,13 @@ InfoBox fetch_wiki_data(char *api_url) {
res = curl_easy_perform(curl_handle);
if (res == CURLE_OK) {
if (res == CURLE_OK && chunk.size > 0) {
cache_key = cache_compute_key(api_url, 0, "wikipedia");
if (cache_key && get_cache_ttl_infobox() > 0) {
cache_set(cache_key, chunk.memory, chunk.size);
}
free(cache_key);
xmlDocPtr doc =
xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
if (doc != NULL) {

View File

@@ -5,7 +5,9 @@
#include <stdio.h>
#include <stdlib.h>
#include "Cache/Cache.h"
#include "Config.h"
#include "Infobox/Wikipedia.h"
#include "Proxy/Proxy.h"
#include "Routes/Home.h"
#include "Routes/ImageProxy.h"
@@ -37,12 +39,27 @@ int main() {
.proxy_list_file = "",
.max_proxy_retries = 3,
.randomize_username = 0,
.randomize_password = 0};
.randomize_password = 0,
.cache_dir = "/tmp/omnisearch_cache",
.cache_ttl_search = 3600,
.cache_ttl_infobox = 86400};
if (load_config("config.ini", &config) != 0) {
fprintf(stderr, "Warning: Could not load config file, using defaults\n");
}
if (cache_init(config.cache_dir) != 0) {
fprintf(
stderr,
"Warning: Failed to initialize cache, continuing without caching\n");
} else {
fprintf(stderr, "Cache initialized at %s\n", config.cache_dir);
cache_cleanup(config.cache_ttl_search);
}
set_cache_ttl_search(config.cache_ttl_search);
set_cache_ttl_infobox(config.cache_ttl_infobox);
if (config.proxy_list_file[0] != '\0') {
if (load_proxy_list(config.proxy_list_file) < 0) {
fprintf(
@@ -82,5 +99,6 @@ int main() {
curl_global_cleanup();
xmlCleanupParser();
free_proxy_list();
cache_shutdown();
return EXIT_SUCCESS;
}

View File

@@ -1,4 +1,5 @@
#include "Scraping.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Utility/Unescape.h"
#include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
for (int i = 0; i < num_jobs; i++) {
ScrapeJob *job = &jobs[i];
char cache_key[64];
char full_url[1024];
char *encoded_query = NULL;
if (job->handle) {
curl_easy_cleanup(job->handle);
job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
free(job->response.memory);
}
job->handle = curl_easy_init();
if (!job->handle) {
continue;
}
job->response.memory = (char *)malloc(16384);
job->response.size = 0;
job->response.capacity = 16384;
char full_url[1024];
char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
encoded_query = curl_easy_escape(NULL, job->query, 0);
if (!encoded_query) {
curl_easy_cleanup(job->handle);
job->handle = NULL;
continue;
}
@@ -399,7 +392,52 @@ retry:
snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
encoded_query, job->engine->page_param, page_value);
curl_free(encoded_query);
char *key = cache_compute_key(job->query, job->page, job->engine->name);
if (key) {
strncpy(cache_key, key, sizeof(cache_key) - 1);
cache_key[sizeof(cache_key) - 1] = '\0';
free(key);
} else {
snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
job->engine->name);
}
char *cached_data = NULL;
size_t cached_size = 0;
int cache_hit = 0;
if (get_cache_ttl_search() > 0 &&
cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING);
if (doc) {
job->results_count = job->engine->parser(
job->engine->name, doc, job->out_results, job->max_results);
xmlFreeDoc(doc);
cache_hit = 1;
}
free(cached_data);
}
if (cache_hit) {
free(encoded_query);
job->results_count = job->results_count > 0 ? job->results_count : 0;
continue;
}
job->handle = curl_easy_init();
if (!job->handle) {
free(encoded_query);
continue;
}
job->response.memory = (char *)malloc(16384);
job->response.size = 0;
job->response.capacity = 16384;
struct curl_slist *headers = NULL;
char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
if (msg->data.result == CURLE_OK && job->response.size > 0) {
char *key =
cache_compute_key(job->query, job->page, job->engine->name);
if (key && get_cache_ttl_search() > 0) {
cache_set(key, job->response.memory, job->response.size);
free(key);
}
xmlDocPtr doc = htmlReadMemory(
job->response.memory, job->response.size, NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);