refactor: put HTTP and XML logic into reusable modules
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
#include "Wikipedia.h"
|
||||
#include "../Cache/Cache.h"
|
||||
#include "../Proxy/Proxy.h"
|
||||
#include "../Scraping/Scraping.h"
|
||||
#include "../Utility/HttpClient.h"
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/parser.h>
|
||||
#include <libxml/tree.h>
|
||||
@@ -9,11 +9,6 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
struct WikiMemoryStruct {
|
||||
char *memory;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
static void shorten_summary(char **extract_ptr, int max_chars) {
|
||||
if (!extract_ptr || !*extract_ptr)
|
||||
return;
|
||||
@@ -43,25 +38,6 @@ static void shorten_summary(char **extract_ptr, int max_chars) {
|
||||
}
|
||||
}
|
||||
|
||||
static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb,
|
||||
void *userp) {
|
||||
size_t realsize = size * nmemb;
|
||||
struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp;
|
||||
|
||||
char *ptr = realloc(mem->memory, mem->size + realsize + 1);
|
||||
if (ptr == NULL) {
|
||||
fprintf(stderr, "Not enough memory (realloc returned NULL)\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
mem->memory = ptr;
|
||||
memcpy(&(mem->memory[mem->size]), contents, realsize);
|
||||
mem->size += realsize;
|
||||
mem->memory[mem->size] = 0;
|
||||
|
||||
return realsize;
|
||||
}
|
||||
|
||||
static void extract_wiki_info(xmlNode *node, InfoBox *info) {
|
||||
xmlNode *cur_node = NULL;
|
||||
|
||||
@@ -113,9 +89,6 @@ static void extract_wiki_info(xmlNode *node, InfoBox *info) {
|
||||
}
|
||||
|
||||
InfoBox fetch_wiki_data(char *api_url) {
|
||||
CURL *curl_handle;
|
||||
CURLcode res;
|
||||
struct WikiMemoryStruct chunk;
|
||||
InfoBox info = {NULL, NULL, NULL, NULL};
|
||||
|
||||
if (!api_url) {
|
||||
@@ -144,47 +117,31 @@ InfoBox fetch_wiki_data(char *api_url) {
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
chunk.memory = malloc(1);
|
||||
chunk.size = 0;
|
||||
|
||||
curl_handle = curl_easy_init();
|
||||
|
||||
if (curl_handle) {
|
||||
curl_easy_setopt(curl_handle, CURLOPT_URL, api_url);
|
||||
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION,
|
||||
WikiWriteMemoryCallback);
|
||||
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
|
||||
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
|
||||
apply_proxy_settings(curl_handle);
|
||||
|
||||
res = curl_easy_perform(curl_handle);
|
||||
|
||||
if (res == CURLE_OK && chunk.size > 0) {
|
||||
cache_key = cache_compute_key(api_url, 0, "wikipedia");
|
||||
if (cache_key && get_cache_ttl_infobox() > 0) {
|
||||
cache_set(cache_key, chunk.memory, chunk.size);
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
xmlDocPtr doc =
|
||||
xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
|
||||
if (doc != NULL) {
|
||||
xmlNode *root_element = xmlDocGetRootElement(doc);
|
||||
extract_wiki_info(root_element, &info);
|
||||
xmlFreeDoc(doc);
|
||||
}
|
||||
HttpResponse resp = http_get(api_url, "libcurl-agent/1.0");
|
||||
if (resp.memory && resp.size > 0) {
|
||||
cache_key = cache_compute_key(api_url, 0, "wikipedia");
|
||||
if (cache_key && get_cache_ttl_infobox() > 0) {
|
||||
cache_set(cache_key, resp.memory, resp.size);
|
||||
}
|
||||
free(cache_key);
|
||||
|
||||
curl_easy_cleanup(curl_handle);
|
||||
free(chunk.memory);
|
||||
xmlDocPtr doc =
|
||||
xmlReadMemory(resp.memory, resp.size, "noname.xml", NULL, 0);
|
||||
if (doc != NULL) {
|
||||
xmlNode *root_element = xmlDocGetRootElement(doc);
|
||||
extract_wiki_info(root_element, &info);
|
||||
xmlFreeDoc(doc);
|
||||
}
|
||||
}
|
||||
|
||||
http_response_free(&resp);
|
||||
return info;
|
||||
}
|
||||
|
||||
static xmlNode *find_node_recursive(xmlNode *node, const char *target_name) {
|
||||
for (xmlNode *cur = node; cur; cur = cur->next) {
|
||||
if (cur->type == XML_ELEMENT_NODE && strcmp((const char *)cur->name, target_name) == 0) {
|
||||
if (cur->type == XML_ELEMENT_NODE &&
|
||||
strcmp((const char *)cur->name, target_name) == 0) {
|
||||
return cur;
|
||||
}
|
||||
xmlNode *found = find_node_recursive(cur->children, target_name);
|
||||
@@ -195,21 +152,15 @@ static xmlNode *find_node_recursive(xmlNode *node, const char *target_name) {
|
||||
}
|
||||
|
||||
static char *get_first_search_result(const char *search_term) {
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl)
|
||||
return NULL;
|
||||
|
||||
char *escaped_term = curl_easy_escape(curl, search_term, 0);
|
||||
char *escaped_term = curl_easy_escape(NULL, search_term, 0);
|
||||
const char *search_base =
|
||||
"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=";
|
||||
const char *search_suffix =
|
||||
"&format=xml&origin=*&srlimit=1";
|
||||
const char *search_suffix = "&format=xml&origin=*&srlimit=1";
|
||||
|
||||
char *search_url = malloc(strlen(search_base) + strlen(escaped_term) +
|
||||
strlen(search_suffix) + 1);
|
||||
strlen(search_suffix) + 1);
|
||||
if (!search_url) {
|
||||
curl_free(escaped_term);
|
||||
curl_easy_cleanup(curl);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -219,22 +170,13 @@ static char *get_first_search_result(const char *search_term) {
|
||||
|
||||
curl_free(escaped_term);
|
||||
|
||||
struct WikiMemoryStruct chunk = {malloc(1), 0};
|
||||
if (!chunk.memory) {
|
||||
free(search_url);
|
||||
curl_easy_cleanup(curl);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, search_url);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WikiWriteMemoryCallback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk);
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "libcurl-agent/1.0");
|
||||
apply_proxy_settings(curl);
|
||||
HttpResponse resp = http_get(search_url, "libcurl-agent/1.0");
|
||||
free(search_url);
|
||||
|
||||
char *first_title = NULL;
|
||||
if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
|
||||
xmlDocPtr doc = xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
|
||||
if (resp.memory && resp.size > 0) {
|
||||
xmlDocPtr doc =
|
||||
xmlReadMemory(resp.memory, resp.size, "noname.xml", NULL, 0);
|
||||
if (doc) {
|
||||
xmlNode *root = xmlDocGetRootElement(doc);
|
||||
xmlNode *search_node = find_node_recursive(root, "search");
|
||||
@@ -255,10 +197,7 @@ static char *get_first_search_result(const char *search_term) {
|
||||
}
|
||||
}
|
||||
|
||||
free(chunk.memory);
|
||||
free(search_url);
|
||||
curl_easy_cleanup(curl);
|
||||
|
||||
http_response_free(&resp);
|
||||
return first_title;
|
||||
}
|
||||
|
||||
@@ -267,13 +206,7 @@ char *construct_wiki_url(const char *search_term) {
|
||||
if (!first_title)
|
||||
return NULL;
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
free(first_title);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
char *escaped_title = curl_easy_escape(curl, first_title, 0);
|
||||
char *escaped_title = curl_easy_escape(NULL, first_title, 0);
|
||||
const char *base = "https://en.wikipedia.org/w/"
|
||||
"api.php?action=query&prop=extracts|pageimages&exintro&"
|
||||
"explaintext&pithumbsize=400&format=xml&origin=*&titles=";
|
||||
@@ -285,7 +218,6 @@ char *construct_wiki_url(const char *search_term) {
|
||||
}
|
||||
|
||||
curl_free(escaped_title);
|
||||
curl_easy_cleanup(curl);
|
||||
free(first_title);
|
||||
return full_url;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user