feature: add caching for scraped search results

This commit is contained in:
frosty
2026-03-10 03:40:34 -04:00
parent a11bf8bb6c
commit e33310f263
8 changed files with 185 additions and 18 deletions

View File

@@ -1,4 +1,5 @@
#include "Scraping.h"
#include "../Cache/Cache.h"
#include "../Proxy/Proxy.h"
#include "../Utility/Unescape.h"
#include <curl/curl.h>
@@ -368,6 +369,10 @@ retry:
for (int i = 0; i < num_jobs; i++) {
ScrapeJob *job = &jobs[i];
char cache_key[64];
char full_url[1024];
char *encoded_query = NULL;
if (job->handle) {
curl_easy_cleanup(job->handle);
job->handle = NULL;
@@ -376,20 +381,8 @@ retry:
free(job->response.memory);
}
job->handle = curl_easy_init();
if (!job->handle) {
continue;
}
job->response.memory = (char *)malloc(16384);
job->response.size = 0;
job->response.capacity = 16384;
char full_url[1024];
char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
encoded_query = curl_easy_escape(NULL, job->query, 0);
if (!encoded_query) {
curl_easy_cleanup(job->handle);
job->handle = NULL;
continue;
}
@@ -399,7 +392,52 @@ retry:
snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
encoded_query, job->engine->page_param, page_value);
curl_free(encoded_query);
char *key = cache_compute_key(job->query, job->page, job->engine->name);
if (key) {
strncpy(cache_key, key, sizeof(cache_key) - 1);
cache_key[sizeof(cache_key) - 1] = '\0';
free(key);
} else {
snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
job->engine->name);
}
char *cached_data = NULL;
size_t cached_size = 0;
int cache_hit = 0;
if (get_cache_ttl_search() > 0 &&
cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
&cached_size) == 0 &&
cached_data && cached_size > 0) {
xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
HTML_PARSE_NOWARNING);
if (doc) {
job->results_count = job->engine->parser(
job->engine->name, doc, job->out_results, job->max_results);
xmlFreeDoc(doc);
cache_hit = 1;
}
free(cached_data);
}
if (cache_hit) {
free(encoded_query);
job->results_count = job->results_count > 0 ? job->results_count : 0;
continue;
}
job->handle = curl_easy_init();
if (!job->handle) {
free(encoded_query);
continue;
}
job->response.memory = (char *)malloc(16384);
job->response.size = 0;
job->response.capacity = 16384;
struct curl_slist *headers = NULL;
char host_buf[256], ref_buf[256];
@@ -451,6 +489,13 @@ retry:
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
if (msg->data.result == CURLE_OK && job->response.size > 0) {
char *key =
cache_compute_key(job->query, job->page, job->engine->name);
if (key && get_cache_ttl_search() > 0) {
cache_set(key, job->response.memory, job->response.size);
free(key);
}
xmlDocPtr doc = htmlReadMemory(
job->response.memory, job->response.size, NULL, NULL,
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);