feature: added caching
This commit is contained in:
2
Makefile
2
Makefile
@@ -10,7 +10,7 @@ else
|
||||
LDFLAGS :=
|
||||
endif
|
||||
|
||||
LIBS := -lbeaker -lcurl -lxml2 -lpthread -lm
|
||||
LIBS := -lbeaker -lcurl -lxml2 -lpthread -lm -lssl -lcrypto
|
||||
|
||||
SRC_DIR := src
|
||||
BIN_DIR := bin
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[server]
|
||||
host = 0.0.0.0
|
||||
port = 8000
|
||||
|
||||
|
||||
[proxy]
|
||||
# Single proxy (comment out to use list_file instead)
|
||||
#proxy = "socks5://127.0.0.1:9050"
|
||||
@@ -14,3 +14,13 @@ port = 8000
|
||||
# Randomize proxy credentials for each request
|
||||
#randomize_username = true
|
||||
#randomize_password = true
|
||||
|
||||
[cache]
|
||||
# Directory to store cached responses
|
||||
#dir = /tmp/omnisearch_cache
|
||||
|
||||
# Cache TTL for search results in seconds (default: 3600 = 1 hour; 0 or less disables search caching)
|
||||
#ttl_search = 3600
|
||||
|
||||
# Cache TTL for infobox data in seconds (default: 86400 = 24 hours; 0 or less disables infobox caching)
|
||||
#ttl_infobox = 86400
|
||||
|
||||
@@ -80,6 +80,15 @@ int load_config(const char *filename, Config *config) {
|
||||
} else if (strcmp(key, "randomize_password") == 0) {
|
||||
config->randomize_password = atoi(value);
|
||||
}
|
||||
} else if (strcmp(section, "cache") == 0) {
|
||||
if (strcmp(key, "dir") == 0) {
|
||||
strncpy(config->cache_dir, value, sizeof(config->cache_dir) - 1);
|
||||
config->cache_dir[sizeof(config->cache_dir) - 1] = '\0';
|
||||
} else if (strcmp(key, "ttl_search") == 0) {
|
||||
config->cache_ttl_search = atoi(value);
|
||||
} else if (strcmp(key, "ttl_infobox") == 0) {
|
||||
config->cache_ttl_infobox = atoi(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,6 +9,9 @@ typedef struct {
|
||||
int max_proxy_retries;
|
||||
int randomize_username;
|
||||
int randomize_password;
|
||||
char cache_dir[512];
|
||||
int cache_ttl_search;
|
||||
int cache_ttl_infobox;
|
||||
} Config;
|
||||
|
||||
int load_config(const char *filename, Config *config);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "Dictionary.h"
|
||||
#include "../Cache/Cache.h"
|
||||
#include "../Proxy/Proxy.h"
|
||||
#include "../Scraping/Scraping.h"
|
||||
#include <ctype.h>
|
||||
@@ -266,6 +267,48 @@ InfoBox fetch_dictionary_data(const char *query) {
|
||||
if (!url)
|
||||
return info;
|
||||
|
||||
char *cache_key = cache_compute_key(url, 0, "dictionary");
|
||||
if (cache_key && get_cache_ttl_infobox() > 0) {
|
||||
char *cached_data = NULL;
|
||||
size_t cached_size = 0;
|
||||
if (cache_get(cache_key, (time_t)get_cache_ttl_infobox(), &cached_data,
|
||||
&cached_size) == 0 &&
|
||||
cached_data && cached_size > 0) {
|
||||
htmlDocPtr doc = htmlReadMemory(cached_data, cached_size, url, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
|
||||
HTML_PARSE_NOWARNING);
|
||||
if (doc) {
|
||||
char *word = xpath_text(doc, "//span[@class='hw dhw']");
|
||||
char *pron = xpath_text(
|
||||
doc,
|
||||
"//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
|
||||
char *pos = xpath_text(doc, "//span[@class='pos dpos']");
|
||||
char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
|
||||
char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");
|
||||
|
||||
if (word && def) {
|
||||
info.title = strdup("Dictionary");
|
||||
info.extract = build_html(word, pron, pos, def, ex);
|
||||
info.thumbnail_url = strdup("/static/dictionary.jpg");
|
||||
info.url = strdup(url);
|
||||
}
|
||||
|
||||
free(word);
|
||||
free(pron);
|
||||
free(pos);
|
||||
free(def);
|
||||
free(ex);
|
||||
xmlFreeDoc(doc);
|
||||
}
|
||||
free(cached_data);
|
||||
free(cache_key);
|
||||
free(url);
|
||||
return info;
|
||||
}
|
||||
free(cached_data);
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
free(url);
|
||||
@@ -281,6 +324,12 @@ InfoBox fetch_dictionary_data(const char *query) {
|
||||
apply_proxy_settings(curl);
|
||||
|
||||
if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
|
||||
cache_key = cache_compute_key(url, 0, "dictionary");
|
||||
if (cache_key && get_cache_ttl_infobox() > 0) {
|
||||
cache_set(cache_key, chunk.memory, chunk.size);
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
|
||||
HTML_PARSE_NOWARNING);
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "Wikipedia.h"
|
||||
#include "../Cache/Cache.h"
|
||||
#include "../Proxy/Proxy.h"
|
||||
#include "../Scraping/Scraping.h"
|
||||
#include <curl/curl.h>
|
||||
@@ -117,6 +118,32 @@ InfoBox fetch_wiki_data(char *api_url) {
|
||||
struct WikiMemoryStruct chunk;
|
||||
InfoBox info = {NULL, NULL, NULL, NULL};
|
||||
|
||||
if (!api_url) {
|
||||
return info;
|
||||
}
|
||||
|
||||
char *cache_key = cache_compute_key(api_url, 0, "wikipedia");
|
||||
if (cache_key && get_cache_ttl_infobox() > 0) {
|
||||
char *cached_data = NULL;
|
||||
size_t cached_size = 0;
|
||||
if (cache_get(cache_key, get_cache_ttl_infobox(), &cached_data,
|
||||
&cached_size) == 0 &&
|
||||
cached_data && cached_size > 0) {
|
||||
xmlDocPtr doc =
|
||||
xmlReadMemory(cached_data, cached_size, "noname.xml", NULL, 0);
|
||||
if (doc != NULL) {
|
||||
xmlNode *root_element = xmlDocGetRootElement(doc);
|
||||
extract_wiki_info(root_element, &info);
|
||||
xmlFreeDoc(doc);
|
||||
}
|
||||
free(cached_data);
|
||||
free(cache_key);
|
||||
return info;
|
||||
}
|
||||
free(cached_data);
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
chunk.memory = malloc(1);
|
||||
chunk.size = 0;
|
||||
|
||||
@@ -132,7 +159,13 @@ InfoBox fetch_wiki_data(char *api_url) {
|
||||
|
||||
res = curl_easy_perform(curl_handle);
|
||||
|
||||
if (res == CURLE_OK) {
|
||||
if (res == CURLE_OK && chunk.size > 0) {
|
||||
cache_key = cache_compute_key(api_url, 0, "wikipedia");
|
||||
if (cache_key && get_cache_ttl_infobox() > 0) {
|
||||
cache_set(cache_key, chunk.memory, chunk.size);
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
xmlDocPtr doc =
|
||||
xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
|
||||
if (doc != NULL) {
|
||||
|
||||
20
src/Main.c
20
src/Main.c
@@ -5,7 +5,9 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "Cache/Cache.h"
|
||||
#include "Config.h"
|
||||
#include "Infobox/Wikipedia.h"
|
||||
#include "Proxy/Proxy.h"
|
||||
#include "Routes/Home.h"
|
||||
#include "Routes/ImageProxy.h"
|
||||
@@ -37,12 +39,27 @@ int main() {
|
||||
.proxy_list_file = "",
|
||||
.max_proxy_retries = 3,
|
||||
.randomize_username = 0,
|
||||
.randomize_password = 0};
|
||||
.randomize_password = 0,
|
||||
.cache_dir = "/tmp/omnisearch_cache",
|
||||
.cache_ttl_search = 3600,
|
||||
.cache_ttl_infobox = 86400};
|
||||
|
||||
if (load_config("config.ini", &config) != 0) {
|
||||
fprintf(stderr, "Warning: Could not load config file, using defaults\n");
|
||||
}
|
||||
|
||||
if (cache_init(config.cache_dir) != 0) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"Warning: Failed to initialize cache, continuing without caching\n");
|
||||
} else {
|
||||
fprintf(stderr, "Cache initialized at %s\n", config.cache_dir);
|
||||
cache_cleanup(config.cache_ttl_search);
|
||||
}
|
||||
|
||||
set_cache_ttl_search(config.cache_ttl_search);
|
||||
set_cache_ttl_infobox(config.cache_ttl_infobox);
|
||||
|
||||
if (config.proxy_list_file[0] != '\0') {
|
||||
if (load_proxy_list(config.proxy_list_file) < 0) {
|
||||
fprintf(
|
||||
@@ -82,5 +99,6 @@ int main() {
|
||||
curl_global_cleanup();
|
||||
xmlCleanupParser();
|
||||
free_proxy_list();
|
||||
cache_shutdown();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "Scraping.h"
|
||||
#include "../Cache/Cache.h"
|
||||
#include "../Proxy/Proxy.h"
|
||||
#include "../Utility/Unescape.h"
|
||||
#include <curl/curl.h>
|
||||
@@ -368,6 +369,10 @@ retry:
|
||||
for (int i = 0; i < num_jobs; i++) {
|
||||
ScrapeJob *job = &jobs[i];
|
||||
|
||||
char cache_key[64];
|
||||
char full_url[1024];
|
||||
char *encoded_query = NULL;
|
||||
|
||||
if (job->handle) {
|
||||
curl_easy_cleanup(job->handle);
|
||||
job->handle = NULL;
|
||||
@@ -376,20 +381,8 @@ retry:
|
||||
free(job->response.memory);
|
||||
}
|
||||
|
||||
job->handle = curl_easy_init();
|
||||
if (!job->handle) {
|
||||
continue;
|
||||
}
|
||||
|
||||
job->response.memory = (char *)malloc(16384);
|
||||
job->response.size = 0;
|
||||
job->response.capacity = 16384;
|
||||
|
||||
char full_url[1024];
|
||||
char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
|
||||
encoded_query = curl_easy_escape(NULL, job->query, 0);
|
||||
if (!encoded_query) {
|
||||
curl_easy_cleanup(job->handle);
|
||||
job->handle = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -399,7 +392,52 @@ retry:
|
||||
|
||||
snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
|
||||
encoded_query, job->engine->page_param, page_value);
|
||||
curl_free(encoded_query);
|
||||
|
||||
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
||||
if (key) {
|
||||
strncpy(cache_key, key, sizeof(cache_key) - 1);
|
||||
cache_key[sizeof(cache_key) - 1] = '\0';
|
||||
free(key);
|
||||
} else {
|
||||
snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
|
||||
job->engine->name);
|
||||
}
|
||||
|
||||
char *cached_data = NULL;
|
||||
size_t cached_size = 0;
|
||||
int cache_hit = 0;
|
||||
|
||||
if (get_cache_ttl_search() > 0 &&
|
||||
cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
|
||||
&cached_size) == 0 &&
|
||||
cached_data && cached_size > 0) {
|
||||
xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
|
||||
HTML_PARSE_NOWARNING);
|
||||
if (doc) {
|
||||
job->results_count = job->engine->parser(
|
||||
job->engine->name, doc, job->out_results, job->max_results);
|
||||
xmlFreeDoc(doc);
|
||||
cache_hit = 1;
|
||||
}
|
||||
free(cached_data);
|
||||
}
|
||||
|
||||
if (cache_hit) {
|
||||
free(encoded_query);
|
||||
job->results_count = job->results_count > 0 ? job->results_count : 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
job->handle = curl_easy_init();
|
||||
if (!job->handle) {
|
||||
free(encoded_query);
|
||||
continue;
|
||||
}
|
||||
|
||||
job->response.memory = (char *)malloc(16384);
|
||||
job->response.size = 0;
|
||||
job->response.capacity = 16384;
|
||||
|
||||
struct curl_slist *headers = NULL;
|
||||
char host_buf[256], ref_buf[256];
|
||||
@@ -451,6 +489,13 @@ retry:
|
||||
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
|
||||
|
||||
if (msg->data.result == CURLE_OK && job->response.size > 0) {
|
||||
char *key =
|
||||
cache_compute_key(job->query, job->page, job->engine->name);
|
||||
if (key && get_cache_ttl_search() > 0) {
|
||||
cache_set(key, job->response.memory, job->response.size);
|
||||
free(key);
|
||||
}
|
||||
|
||||
xmlDocPtr doc = htmlReadMemory(
|
||||
job->response.memory, job->response.size, NULL, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
|
||||
Reference in New Issue
Block a user