fix: refactored scraping components

This commit is contained in:
frosty
2026-03-17 13:51:12 -04:00
parent 8c6632502f
commit c7b95d0571
8 changed files with 816 additions and 700 deletions

View File

@@ -3,6 +3,7 @@
#include <curl/curl.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
typedef struct {
char *url;
@@ -45,6 +46,25 @@ typedef struct {
/*
 * Read-only table of SearchEngine descriptors and an accompanying integer
 * count. Both are only declared here (extern); the definitions live in
 * another translation unit.
 * NOTE(review): presumably ENGINE_COUNT is the number of entries in
 * ENGINE_REGISTRY -- confirm against the definition.
 */
extern const SearchEngine ENGINE_REGISTRY[];
extern const int ENGINE_COUNT;
/*
 * Data callback taking (contents, size, nmemb, userp) and returning size_t,
 * the same parameter shape libcurl uses for its write callbacks.
 * NOTE(review): presumably installed via CURLOPT_WRITEFUNCTION with userp
 * pointing at a MemoryBuffer, and returning the number of bytes consumed --
 * confirm against the definition.
 */
size_t write_memory_callback(void *contents, size_t size, size_t nmemb,
void *userp);
/*
 * Returns a pointer to a constant string; takes no arguments.
 * NOTE(review): the name suggests a User-Agent value picked at random, and
 * the const return suggests the caller must not modify or free it -- confirm
 * both against the definition.
 */
const char *get_random_user_agent(void);
/*
 * Takes an existing CURL handle plus a URL, a MemoryBuffer and a header
 * list; returns nothing, so failures are not reported through this
 * interface.
 * NOTE(review): presumably sets the request URL, attaches `headers`, and
 * routes the response body into `chunk` -- the exact options set are not
 * visible here; confirm against the definition.
 */
void configure_curl_handle(CURL *curl, const char *full_url,
MemoryBuffer *chunk, struct curl_slist *headers);
/*
 * Produces a URL string from a base URL, a page-parameter name, two integer
 * paging factors, a query string and a page number.
 * NOTE(review): the non-const char * return suggests a heap-allocated string
 * the caller must free, and `encoded_query` is presumably already
 * URL-encoded. How page, page_multiplier and page_base are combined is not
 * visible here -- confirm all of this against the definition.
 */
char *build_search_url(const char *base_url, const char *page_param,
int page_multiplier, int page_base,
const char *encoded_query, int page);
/*
 * Returns a libcurl string list built from a host header value and a
 * referer value.
 * NOTE(review): presumably the caller owns the list and must release it with
 * curl_slist_free_all(); whether NULL arguments are accepted and whether
 * NULL can be returned is not visible here -- confirm.
 */
struct curl_slist *build_request_headers(const char *host_header,
const char *referer);
/*
 * Takes no arguments and returns nothing.
 * NOTE(review): presumably pauses between HTTP requests; the duration and
 * mechanism are not visible here -- confirm against the definition.
 */
void http_delay(void);
/*
 * Returns an XPath context for the given parsed document.
 * NOTE(review): presumably a thin wrapper over xmlXPathNewContext() that may
 * return NULL on failure -- confirm against the definition.
 */
xmlXPathContextPtr create_xpath_context(xmlDocPtr doc);
/*
 * Takes an XPath context and an XPath result object; returns nothing.
 * NOTE(review): presumably releases both; whether either argument may be
 * NULL is not visible here -- confirm against the definition.
 */
void free_xpath_objects(xmlXPathContextPtr ctx, xmlXPathObjectPtr obj);
/*
 * Returns a pointer to SearchResult storage sized from two integer limits.
 * NOTE(review): how `capacity` and `max_results` interact (e.g. which one
 * bounds the allocation), whether the memory is zeroed, and whether NULL is
 * returned on failure are not visible here -- confirm against the
 * definition.
 */
SearchResult *alloc_results_array(int capacity, int max_results);
/*
 * Fills one SearchResult from url, title and snippet strings; `unescape` is
 * an integer flag.
 * NOTE(review): presumably a nonzero `unescape` decodes the strings before
 * storing them. Whether `result` takes ownership of the passed pointers or
 * copies them is not visible here -- confirm, since it decides who frees
 * them.
 */
void assign_result(SearchResult *result, char *url, char *title, char *snippet,
int unescape);
/*
 * Takes three char * values (title, url, snippet) and returns nothing; note
 * that, despite the name, no node list is passed. The parameter order here
 * (title, url, snippet) differs from assign_result (url, title, snippet).
 * NOTE(review): presumably frees the three strings; which deallocator is
 * used (free vs. xmlFree) is not visible here -- confirm against the
 * definition.
 */
void free_xml_node_list(char *title, char *url, char *snippet);
/*
 * Runs a query against one SearchEngine descriptor, with an output pointer
 * for a SearchResult array and an integer result limit.
 * NOTE(review): presumably *out_results receives a newly allocated array
 * that the caller must free, and the int return is a result count or an
 * error indicator -- the exact convention is not visible here; confirm
 * against the definition.
 */
int scrape_engine(const SearchEngine *engine, const char *query,
SearchResult **out_results, int max_results);