feature: added caching
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
#include "Scraping.h"
|
||||
#include "../Cache/Cache.h"
|
||||
#include "../Proxy/Proxy.h"
|
||||
#include "../Utility/Unescape.h"
|
||||
#include <curl/curl.h>
|
||||
@@ -368,6 +369,10 @@ retry:
|
||||
for (int i = 0; i < num_jobs; i++) {
|
||||
ScrapeJob *job = &jobs[i];
|
||||
|
||||
char cache_key[64];
|
||||
char full_url[1024];
|
||||
char *encoded_query = NULL;
|
||||
|
||||
if (job->handle) {
|
||||
curl_easy_cleanup(job->handle);
|
||||
job->handle = NULL;
|
||||
@@ -376,20 +381,8 @@ retry:
|
||||
free(job->response.memory);
|
||||
}
|
||||
|
||||
job->handle = curl_easy_init();
|
||||
if (!job->handle) {
|
||||
continue;
|
||||
}
|
||||
|
||||
job->response.memory = (char *)malloc(16384);
|
||||
job->response.size = 0;
|
||||
job->response.capacity = 16384;
|
||||
|
||||
char full_url[1024];
|
||||
char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
|
||||
encoded_query = curl_easy_escape(NULL, job->query, 0);
|
||||
if (!encoded_query) {
|
||||
curl_easy_cleanup(job->handle);
|
||||
job->handle = NULL;
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -399,7 +392,52 @@ retry:
|
||||
|
||||
snprintf(full_url, sizeof(full_url), "%s%s&%s=%d", job->engine->base_url,
|
||||
encoded_query, job->engine->page_param, page_value);
|
||||
curl_free(encoded_query);
|
||||
|
||||
char *key = cache_compute_key(job->query, job->page, job->engine->name);
|
||||
if (key) {
|
||||
strncpy(cache_key, key, sizeof(cache_key) - 1);
|
||||
cache_key[sizeof(cache_key) - 1] = '\0';
|
||||
free(key);
|
||||
} else {
|
||||
snprintf(cache_key, sizeof(cache_key), "uncached_%d_%s", i,
|
||||
job->engine->name);
|
||||
}
|
||||
|
||||
char *cached_data = NULL;
|
||||
size_t cached_size = 0;
|
||||
int cache_hit = 0;
|
||||
|
||||
if (get_cache_ttl_search() > 0 &&
|
||||
cache_get(cache_key, (time_t)get_cache_ttl_search(), &cached_data,
|
||||
&cached_size) == 0 &&
|
||||
cached_data && cached_size > 0) {
|
||||
xmlDocPtr doc = htmlReadMemory(cached_data, cached_size, NULL, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR |
|
||||
HTML_PARSE_NOWARNING);
|
||||
if (doc) {
|
||||
job->results_count = job->engine->parser(
|
||||
job->engine->name, doc, job->out_results, job->max_results);
|
||||
xmlFreeDoc(doc);
|
||||
cache_hit = 1;
|
||||
}
|
||||
free(cached_data);
|
||||
}
|
||||
|
||||
if (cache_hit) {
|
||||
free(encoded_query);
|
||||
job->results_count = job->results_count > 0 ? job->results_count : 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
job->handle = curl_easy_init();
|
||||
if (!job->handle) {
|
||||
free(encoded_query);
|
||||
continue;
|
||||
}
|
||||
|
||||
job->response.memory = (char *)malloc(16384);
|
||||
job->response.size = 0;
|
||||
job->response.capacity = 16384;
|
||||
|
||||
struct curl_slist *headers = NULL;
|
||||
char host_buf[256], ref_buf[256];
|
||||
@@ -451,6 +489,13 @@ retry:
|
||||
curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
|
||||
|
||||
if (msg->data.result == CURLE_OK && job->response.size > 0) {
|
||||
char *key =
|
||||
cache_compute_key(job->query, job->page, job->engine->name);
|
||||
if (key && get_cache_ttl_search() > 0) {
|
||||
cache_set(key, job->response.memory, job->response.size);
|
||||
free(key);
|
||||
}
|
||||
|
||||
xmlDocPtr doc = htmlReadMemory(
|
||||
job->response.memory, job->response.size, NULL, NULL,
|
||||
HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
|
||||
|
||||
Reference in New Issue
Block a user