oopsies
This commit is contained in:
67
src/Config.c
Normal file
67
src/Config.c
Normal file
@@ -0,0 +1,67 @@
|
||||
#include "Config.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Trim leading/trailing spaces and tabs in place; returns the first
 * non-blank character of `s` (the string is modified at its tail). */
static char *trim_ws(char *s) {
  while (*s == ' ' || *s == '\t') s++;
  char *end = s + strlen(s);
  while (end > s && (end[-1] == ' ' || end[-1] == '\t')) {
    *--end = '\0';
  }
  return s;
}

/*
 * Parse a simple INI-style file into `config`.
 *
 * Recognized syntax: `[section]` headers, `key = value` pairs, and
 * comment lines starting with '#' or ';'.  Only the [server] section's
 * `host` and `port` keys are consumed; unknown keys are ignored.
 *
 * Returns 0 on success, -1 if the file cannot be opened.  Keys absent
 * from the file leave `config` untouched, so callers may pre-populate
 * defaults (as Main.c does).  An invalid or out-of-range port value is
 * ignored rather than silently becoming 0 (previous atoi() behavior).
 */
int load_config(const char *filename, Config *config) {
  FILE *file = fopen(filename, "r");
  if (!file) {
    return -1;
  }

  char line[512];
  char section[64] = "";

  while (fgets(line, sizeof(line), file)) {
    /* Strip trailing CR/LF so Windows line endings work too. */
    line[strcspn(line, "\r\n")] = 0;

    if (line[0] == '\0' || line[0] == '#' || line[0] == ';') {
      continue;
    }

    if (line[0] == '[') {
      char *end = strchr(line, ']');
      if (end) {
        *end = '\0';
        /* snprintf always NUL-terminates; no manual termination needed. */
        snprintf(section, sizeof(section), "%s", line + 1);
      }
      continue;
    }

    char *delimiter = strchr(line, '=');
    if (delimiter) {
      *delimiter = '\0';
      char *key = trim_ws(line);
      char *value = trim_ws(delimiter + 1);

      if (strcmp(section, "server") == 0) {
        if (strcmp(key, "host") == 0) {
          snprintf(config->host, sizeof(config->host), "%s", value);
        } else if (strcmp(key, "port") == 0) {
          char *endp;
          long port = strtol(value, &endp, 10);
          /* Accept only a fully-numeric, valid TCP port. */
          if (endp != value && *endp == '\0' && port > 0 && port <= 65535) {
            config->port = (int)port;
          }
        }
      }
    }
  }

  fclose(file);
  return 0;
}
|
||||
11
src/Config.h
Normal file
11
src/Config.h
Normal file
@@ -0,0 +1,11 @@
|
||||
#ifndef CONFIG_H
#define CONFIG_H

/* Runtime server configuration, filled in by load_config(). */
typedef struct {
  char host[256]; /* listen address, e.g. "0.0.0.0" */
  int port;       /* TCP listen port */
} Config;

/* Parse an INI-style file into `config`.  Returns 0 on success, -1 if
 * the file cannot be opened; keys absent from the file leave the
 * corresponding fields untouched. */
int load_config(const char *filename, Config *config);

#endif
|
||||
115
src/Infobox/Calculator.c
Normal file
115
src/Infobox/Calculator.c
Normal file
@@ -0,0 +1,115 @@
|
||||
#include "Calculator.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <ctype.h>
|
||||
|
||||
/* HTML log of each arithmetic step; reset at the start of every
 * evaluate() call.  Shared static state — not thread-safe. */
static char logic_log[4096];

/* Cursor over the expression string being parsed. */
typedef struct {
  const char *buffer; /* expression text (not owned) */
  int pos;            /* current read offset into buffer */
} Parser;

/* Forward declaration: the grammar is mutually recursive
 * (expression -> term -> factor -> parenthesized expression). */
static double parse_expression(Parser *p);
|
||||
|
||||
/* Advance the parser cursor past any run of space characters. */
static void skip_ws(Parser *p) {
  for (; p->buffer[p->pos] == ' '; p->pos++) {
  }
}
|
||||
|
||||
/* factor := '-' factor | '(' expression ')' | number
 *
 * Numbers are consumed with strtod; the cursor is moved to the first
 * character strtod did not accept.  An unmatched ')' is tolerated. */
static double parse_factor(Parser *p) {
  skip_ws(p);
  char c = p->buffer[p->pos];

  if (c == '-') {
    p->pos++;
    return -parse_factor(p);
  }

  if (c == '(') {
    p->pos++;
    double inner = parse_expression(p);
    if (p->buffer[p->pos] == ')') {
      p->pos++;
    }
    return inner;
  }

  char *after;
  double num = strtod(&p->buffer[p->pos], &after);
  p->pos = (int)(after - p->buffer);
  return num;
}
|
||||
|
||||
/*
 * term := factor (('*' | '/') factor)*
 *
 * Each applied operation is appended to logic_log as an HTML <div> so
 * the UI can show the step-by-step working.  Division by zero is not
 * guarded; IEEE-754 semantics apply (result is +/-inf or NaN).
 */
static double parse_term(Parser *p) {
  double left = parse_factor(p);
  while (1) {
    skip_ws(p);
    char op = p->buffer[p->pos];
    if (op == '*' || op == '/') {
      p->pos++;
      double right = parse_factor(p);
      double old = left;
      left = (op == '*') ? left * right : left / right;

      char step[256];

      snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op,
               right, left);
      /* Bounded append; steps are silently dropped once logic_log fills. */
      strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1);
    } else
      break;
  }
  return left;
}
|
||||
|
||||
/*
 * expression := term (('+' | '-') term)*
 *
 * Top level of the recursive-descent grammar; like parse_term, every
 * applied operation is logged to logic_log as an HTML fragment.
 */
static double parse_expression(Parser *p) {
  double left = parse_term(p);
  while (1) {
    skip_ws(p);
    char op = p->buffer[p->pos];
    if (op == '+' || op == '-') {
      p->pos++;
      double right = parse_term(p);
      double old = left;
      left = (op == '+') ? left + right : left - right;

      char step[256];

      snprintf(step, sizeof(step), "<div>%g %c %g = <b>%g</b></div>", old, op,
               right, left);
      /* Bounded append; steps are silently dropped once logic_log fills. */
      strncat(logic_log, step, sizeof(logic_log) - strlen(logic_log) - 1);
    } else
      break;
  }
  return left;
}
|
||||
|
||||
double evaluate(const char *expr) {
|
||||
logic_log[0] = '\0';
|
||||
if (!expr || strlen(expr) == 0) return 0.0;
|
||||
Parser p = {expr, 0};
|
||||
return parse_expression(&p);
|
||||
}
|
||||
|
||||
/*
 * Build the calculator infobox for `math_input`.
 *
 * Evaluates the expression, then renders the step log captured in
 * logic_log (or a "Constant value" placeholder when no operations were
 * applied) plus the final result into an HTML fragment.  All InfoBox
 * fields are heap copies owned by the caller (free via free_infobox).
 * Returns an all-NULL InfoBox when math_input is NULL.
 */
InfoBox fetch_calc_data(char *math_input) {
  InfoBox info = {NULL, NULL, NULL, NULL};
  if (!math_input) return info;

  double result = evaluate(math_input);

  char html_output[5120];
  snprintf(html_output, sizeof(html_output),
           "<div class='calc-container' style='line-height: 1.6;'>"
           "%s"
           "<div style='margin-top: 8px; border-top: 1px solid #eee; "
           "padding-top: 8px; font-size: 1.2em;'>"
           "<b>%g</b>"
           "</div>"
           "</div>",
           strlen(logic_log) > 0 ? logic_log : "<div>Constant value</div>",
           result);

  /* NOTE(review): strdup results are not checked; on OOM some fields may
   * be NULL while others are set — callers appear to tolerate NULLs. */
  info.title = strdup("Calculation");
  info.extract = strdup(html_output);
  info.thumbnail_url =
      strdup("/static/calculation.svg");
  info.url = strdup("#");

  return info;
}
|
||||
9
src/Infobox/Calculator.h
Normal file
9
src/Infobox/Calculator.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef CALCULATOR_H
|
||||
#define CALCULATOR_H
|
||||
|
||||
#include "Infobox.h"
|
||||
|
||||
double evaluate(const char *expr);
|
||||
InfoBox fetch_calc_data(char *math_input);
|
||||
|
||||
#endif
|
||||
246
src/Infobox/Dictionary.c
Normal file
246
src/Infobox/Dictionary.c
Normal file
@@ -0,0 +1,246 @@
|
||||
#include "Dictionary.h"
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/xpath.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
#include <ctype.h>
|
||||
|
||||
/* Query phrasings that mark a dictionary lookup when they START the
 * query; ordered longest-first so the most specific prefix wins. */
static const char *PREFIXES[] = {
    "what is the definition of ", "what's the definition of ",
    "what is the meaning of ", "what's the meaning of ",
    "what does the word ", "definition of ", "meaning of ", "def of ",
    "define ", "definition ", "define:", "def ", "def:",
    "what does ", "what is ", "what's ", "whats ",
    "meaning ", "dictionary ", "dict ", NULL
};

/* Phrasings that mark a dictionary lookup when they END the query. */
static const char *SUFFIXES[] = {
    " definition", " def", " meaning", " mean", " means",
    " dictionary", " dict", " define", " defined",
    " definition?", " def?", " meaning?", " mean?", " means?",
    " in english", " in english?", NULL
};

/* Filler words stripped from the front of the extracted term. */
static const char *SKIP_WORDS[] = {"of ", "the ", "a ", "an ", NULL};
|
||||
|
||||
/*
 * Case-insensitive substring search (portable strcasestr replacement).
 * Returns a pointer to the first match inside `haystack`, `haystack`
 * itself for a NULL/empty needle, or NULL when there is no match (or
 * haystack is NULL with a non-empty needle).
 */
static const char *strcasestr_impl(const char *haystack, const char *needle) {
  if (!haystack || !needle || !*needle) return haystack;
  size_t needle_len = strlen(needle);
  const char *scan = haystack;
  while (*scan) {
    if (strncasecmp(scan, needle, needle_len) == 0) {
      return scan;
    }
    scan++;
  }
  return NULL;
}
|
||||
|
||||
/* Growable byte buffer used as the libcurl write target. */
struct MemStruct { char *memory; size_t size; };

/*
 * libcurl write callback: append the incoming bytes to a MemStruct and
 * keep the buffer NUL-terminated.  Returning 0 (on realloc failure)
 * tells libcurl to abort the transfer.
 */
static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  struct MemStruct *mem = (struct MemStruct *)userp;
  size_t incoming = size * nmemb;

  char *grown = realloc(mem->memory, mem->size + incoming + 1);
  if (!grown) return 0;

  memcpy(grown + mem->size, contents, incoming);
  mem->memory = grown;
  mem->size += incoming;
  mem->memory[mem->size] = 0;
  return incoming;
}
|
||||
|
||||
/*
 * Evaluate `xpath` against `doc` and return the text content of the
 * FIRST matching node as a heap-allocated string (caller frees), or
 * NULL when the expression fails, matches nothing, or strdup fails.
 */
static char *xpath_text(xmlDocPtr doc, const char *xpath) {
  xmlXPathContextPtr ctx = xmlXPathNewContext(doc);
  if (!ctx) return NULL;
  xmlXPathObjectPtr obj = xmlXPathEvalExpression((const xmlChar *)xpath, ctx);
  /* The context is only needed for evaluation; free it immediately. */
  xmlXPathFreeContext(ctx);
  if (!obj || !obj->nodesetval || obj->nodesetval->nodeNr == 0) {
    if (obj) xmlXPathFreeObject(obj);
    return NULL;
  }
  xmlChar *content = xmlNodeGetContent(obj->nodesetval->nodeTab[0]);
  /* Copy into plain malloc'd memory so callers can use free(). */
  char *result = content ? strdup((char *)content) : NULL;
  if (content) xmlFree(content);
  xmlXPathFreeObject(obj);
  return result;
}
|
||||
|
||||
static char *build_html(const char *word, const char *pron, const char *pos,
|
||||
const char *def, const char *ex) {
|
||||
char html[4096];
|
||||
int n = snprintf(html, sizeof(html), "<div class='dict-container' style='line-height: 1.6;'>");
|
||||
if (word) n += snprintf(html + n, sizeof(html) - n,
|
||||
"<div style='font-size: 1.3em; font-weight: bold; margin-bottom: 4px;'>%s</div>", word);
|
||||
if (pron) n += snprintf(html + n, sizeof(html) - n,
|
||||
"<div style='color: #666; margin-bottom: 8px;'>/%s/</div>", pron);
|
||||
if (pos) n += snprintf(html + n, sizeof(html) - n,
|
||||
"<div style='font-style: italic; color: #888; margin-bottom: 8px;'>%s</div>", pos);
|
||||
if (def) n += snprintf(html + n, sizeof(html) - n,
|
||||
"<div style='margin-bottom: 8px;'>%s</div>", def);
|
||||
if (ex) n += snprintf(html + n, sizeof(html) - n,
|
||||
"<div style='color: #555; font-style: italic; margin-top: 8px;'>\"%s\"</div>", ex);
|
||||
snprintf(html + n, sizeof(html) - n, "</div>");
|
||||
return strdup(html);
|
||||
}
|
||||
|
||||
/*
 * Reduce a natural-language query to the single word to look up.
 *
 * Steps, in order (order matters — each stage feeds the next):
 *   1. strip one leading PREFIXES phrase ("define ", "what is ", ...);
 *   2. repeatedly strip leading SKIP_WORDS filler ("the ", "of ", ...);
 *   3. repeatedly truncate at any SUFFIXES phrase (" definition", ...);
 *   4. trim trailing spaces/punctuation, lowercase, and keep only the
 *      first word.
 *
 * Returns a heap-allocated lowercase word (caller frees), or NULL for
 * NULL input, allocation failure, or when nothing remains.
 */
static char *extract_word(const char *query) {
  if (!query) return NULL;

  const char *start = query;

  for (int i = 0; PREFIXES[i]; i++) {
    size_t len = strlen(PREFIXES[i]);
    if (strncasecmp(start, PREFIXES[i], len) == 0) {
      start += len;
      break;
    }
  }

  while (*start == ' ') start++;
  char *word = strdup(start);
  if (!word) return NULL;

  /* Strip filler words one at a time until none match. */
  int changed = 1;
  while (changed) {
    changed = 0;
    for (int i = 0; SKIP_WORDS[i]; i++) {
      size_t len = strlen(SKIP_WORDS[i]);
      if (strncasecmp(word, SKIP_WORDS[i], len) == 0) {
        memmove(word, word + len, strlen(word + len) + 1);
        changed = 1;
        break;
      }
    }
  }

  /* Truncate at the first suffix occurrence, repeatedly — a suffix
   * anywhere in the string cuts everything from that point on. */
  changed = 1;
  while (changed) {
    changed = 0;
    for (int i = 0; SUFFIXES[i]; i++) {
      const char *found = strcasestr_impl(word, SUFFIXES[i]);
      if (found) {
        char *pos = word + (found - word);
        *pos = '\0';
        changed = 1;
        break;
      }
    }
  }

  /* Trim trailing whitespace and sentence punctuation. */
  size_t len = strlen(word);
  while (len > 0 && (word[len-1] == ' ' || word[len-1] == '?' ||
                     word[len-1] == '!' || word[len-1] == '.')) {
    word[--len] = '\0';
  }

  if (len == 0) { free(word); return NULL; }

  /* Lowercase, then keep only the first word. */
  for (size_t i = 0; i < len; i++) word[i] = tolower((unsigned char)word[i]);
  char *space = strchr(word, ' ');
  if (space) *space = '\0';

  return word;
}
|
||||
|
||||
/*
 * Heuristically decide whether `query` is a dictionary lookup.
 *
 * Returns 1 when the query starts with a known PREFIXES phrase (with a
 * non-empty remainder), ends with a known SUFFIXES phrase (within the
 * first 100 characters), or is a short "what is X" question whose X is
 * not introduced by an article/possessive and has at most two words.
 * Returns 0 otherwise (including NULL input).
 */
int is_dictionary_query(const char *query) {
  if (!query) return 0;

  for (int i = 0; PREFIXES[i]; i++) {
    size_t len = strlen(PREFIXES[i]);
    if (strncasecmp(query, PREFIXES[i], len) == 0) {
      const char *after = query + len;
      while (*after == ' ') after++;
      /* Prefix alone doesn't count; there must be a term after it. */
      if (*after != '\0') return 1;
    }
  }

  for (int i = 0; SUFFIXES[i]; i++) {
    const char *pos = strcasestr_impl(query, SUFFIXES[i]);
    if (pos) {
      const char *after = pos + strlen(SUFFIXES[i]);
      /* Allow trailing spaces/punctuation after the suffix. */
      while (*after == ' ' || *after == '?' || *after == '!' || *after == '.') after++;
      if (*after == '\0' && pos > query && (pos - query) < 100) return 1;
    }
  }

  /* "what is X" / "what's X": treat as a lookup only when X is not an
   * article/possessive phrase (those are general questions, not word
   * lookups) and X is a single word, possibly followed by one more. */
  if (strncasecmp(query, "what is ", 8) == 0 ||
      strncasecmp(query, "what's ", 7) == 0 ||
      strncasecmp(query, "whats ", 6) == 0) {
    const char *word = query + (strncasecmp(query, "what is ", 8) == 0 ? 8 :
                                strncasecmp(query, "what's ", 7) == 0 ? 7 : 6);
    const char *articles[] = {"the ", "your ", "my ", "his ", "her ", "their ",
                              "our ", "this ", "that ", "these ", "those ", "a ", "an ", NULL};
    for (int i = 0; articles[i]; i++) {
      if (strncasecmp(word, articles[i], strlen(articles[i])) == 0) return 0;
    }
    const char *space = strchr(word, ' ');
    if (!space || *(space + 1) == '\0' || *(space + 1) == '?') return 1;
  }

  return 0;
}
|
||||
|
||||
char *construct_dictionary_url(const char *query) {
|
||||
char *word = extract_word(query);
|
||||
if (!word) return NULL;
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) { free(word); return NULL; }
|
||||
|
||||
char *escaped = curl_easy_escape(curl, word, 0);
|
||||
const char *base = "https://dictionary.cambridge.org/dictionary/english/";
|
||||
char *url = malloc(strlen(base) + strlen(escaped) + 1);
|
||||
if (url) {
|
||||
strcpy(url, base);
|
||||
strcat(url, escaped);
|
||||
}
|
||||
|
||||
curl_free(escaped);
|
||||
curl_easy_cleanup(curl);
|
||||
free(word);
|
||||
return url;
|
||||
}
|
||||
|
||||
/*
 * Fetch and parse a Cambridge Dictionary page for `query` and build the
 * dictionary infobox.  Returns an all-NULL InfoBox on any failure
 * (no URL, curl failure, page missing the headword or definition).
 * On success all InfoBox fields are heap-allocated; caller releases
 * them via free_infobox().
 */
InfoBox fetch_dictionary_data(const char *query) {
  InfoBox info = {NULL, NULL, NULL, NULL};

  char *url = construct_dictionary_url(query);
  if (!url) return info;

  CURL *curl = curl_easy_init();
  if (!curl) { free(url); return info; }

  /* malloc(1) seed so the write callback can always realloc. */
  struct MemStruct chunk = {malloc(1), 0};
  curl_easy_setopt(curl, CURLOPT_URL, url);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk);
  /* The site blocks unknown agents; present a browser-ish UA. */
  curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0");
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);

  if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
    htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
                                    HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
    if (doc) {
      /* XPath class selectors matching Cambridge's current markup;
       * they will silently return NULL if the site layout changes. */
      char *word = xpath_text(doc, "//span[@class='hw dhw']");
      char *pron = xpath_text(doc, "//span[@class='us dpron-i']//span[@class='ipa dipa lpr-2 lpl-1']");
      char *pos = xpath_text(doc, "//span[@class='pos dpos']");
      char *def = xpath_text(doc, "(//div[@class='def ddef_d db'])[1]");
      char *ex = xpath_text(doc, "(//span[@class='eg deg'])[1]");

      /* Headword + definition are mandatory; the rest are optional. */
      if (word && def) {
        info.title = strdup("Dictionary");
        info.extract = build_html(word, pron, pos, def, ex);
        info.thumbnail_url = strdup("/static/dictionary.jpg");
        info.url = strdup(url);
      }

      free(word); free(pron); free(pos); free(def); free(ex);
      xmlFreeDoc(doc);
    }
  }

  curl_easy_cleanup(curl);
  free(chunk.memory);
  free(url);
  return info;
}
|
||||
10
src/Infobox/Dictionary.h
Normal file
10
src/Infobox/Dictionary.h
Normal file
@@ -0,0 +1,10 @@
|
||||
#ifndef DICTIONARY_H
|
||||
#define DICTIONARY_H
|
||||
|
||||
#include "Infobox.h"
|
||||
|
||||
InfoBox fetch_dictionary_data(const char *word);
|
||||
char *construct_dictionary_url(const char *word);
|
||||
int is_dictionary_query(const char *query);
|
||||
|
||||
#endif
|
||||
13
src/Infobox/Infobox.c
Normal file
13
src/Infobox/Infobox.c
Normal file
@@ -0,0 +1,13 @@
|
||||
#include "Infobox.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
 * Release all heap strings owned by an InfoBox and reset the pointers,
 * so a second call (or later accidental use) is harmless.
 *
 * free(NULL) is a no-op, so the per-field `if` guards of the previous
 * version were redundant and have been dropped; a NULL `info` guard is
 * added instead.
 */
void free_infobox(InfoBox *info) {
  if (!info) return;
  free(info->title);
  free(info->thumbnail_url);
  free(info->extract);
  free(info->url);
  info->title = NULL;
  info->thumbnail_url = NULL;
  info->extract = NULL;
  info->url = NULL;
}
|
||||
13
src/Infobox/Infobox.h
Normal file
13
src/Infobox/Infobox.h
Normal file
@@ -0,0 +1,13 @@
|
||||
#ifndef INFOBOX_H
#define INFOBOX_H

/* One answer-box result (Wikipedia summary, calculator, dictionary...).
 * All fields are heap-allocated strings owned by the InfoBox; any field
 * may be NULL when the data source did not provide it. */
typedef struct {
  char *title;         /* box heading */
  char *thumbnail_url; /* image URL or local static path */
  char *extract;       /* HTML body of the box */
  char *url;           /* link target for the box */
} InfoBox;

/* Free all fields of `info` (the struct itself is not freed). */
void free_infobox(InfoBox *info);

#endif
|
||||
165
src/Infobox/Wikipedia.c
Normal file
165
src/Infobox/Wikipedia.c
Normal file
@@ -0,0 +1,165 @@
|
||||
#include "Wikipedia.h"
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/parser.h>
|
||||
#include <libxml/tree.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
struct WikiMemoryStruct {
|
||||
char *memory;
|
||||
size_t size;
|
||||
};
|
||||
|
||||
static void shorten_summary(char **extract_ptr, int max_chars) {
|
||||
if (!extract_ptr || !*extract_ptr) return;
|
||||
|
||||
char *text = *extract_ptr;
|
||||
int len = strlen(text);
|
||||
|
||||
if (len <= max_chars) return;
|
||||
|
||||
int end_pos = max_chars;
|
||||
for (int i = max_chars; i > (max_chars / 2); i--) {
|
||||
if (text[i] == '.' || text[i] == '!' || text[i] == '?') {
|
||||
end_pos = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
char *new_text = (char *)malloc(end_pos + 4);
|
||||
|
||||
if (new_text) {
|
||||
strncpy(new_text, text, end_pos);
|
||||
new_text[end_pos] = '\0';
|
||||
strcat(new_text, "...");
|
||||
free(*extract_ptr);
|
||||
*extract_ptr = new_text;
|
||||
}
|
||||
}
|
||||
|
||||
static size_t WikiWriteMemoryCallback(void *contents, size_t size, size_t nmemb,
|
||||
void *userp) {
|
||||
size_t realsize = size * nmemb;
|
||||
struct WikiMemoryStruct *mem = (struct WikiMemoryStruct *)userp;
|
||||
|
||||
char *ptr = realloc(mem->memory, mem->size + realsize + 1);
|
||||
if (ptr == NULL) {
|
||||
fprintf(stderr, "Not enough memory (realloc returned NULL)\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
mem->memory = ptr;
|
||||
memcpy(&(mem->memory[mem->size]), contents, realsize);
|
||||
mem->size += realsize;
|
||||
mem->memory[mem->size] = 0;
|
||||
|
||||
return realsize;
|
||||
}
|
||||
|
||||
/*
 * Recursively walk the Wikipedia API XML tree and fill `info` from the
 * first <page title=...>, <thumbnail source=...> and <extract> elements
 * encountered.  The extract is clipped to ~300 characters.
 *
 * NOTE(review): if the document contains more than one of these
 * elements, the earlier strdup'd value is overwritten without being
 * freed (a leak).  The API presumably returns one page per query —
 * confirm before relying on this with multi-page responses.
 */
static void extract_wiki_info(xmlNode *node, InfoBox *info) {
  xmlNode *cur_node = NULL;

  for (cur_node = node; cur_node; cur_node = cur_node->next) {
    if (cur_node->type == XML_ELEMENT_NODE) {
      if (strcmp((const char *)cur_node->name, "page") == 0) {
        xmlChar *title = xmlGetProp(cur_node, (const xmlChar *)"title");
        if (title) {
          info->title = strdup((const char *)title);

          /* Build the article URL from the title, spaces -> underscores
           * per Wikipedia's URL convention. */
          const char *base_article_url = "https://en.wikipedia.org/wiki/";
          char *formatted_title = strdup((const char *)title);
          for (int i = 0; formatted_title[i]; i++) {
            if (formatted_title[i] == ' ') formatted_title[i] = '_';
          }

          info->url =
              malloc(strlen(base_article_url) + strlen(formatted_title) + 1);
          if (info->url) {
            strcpy(info->url, base_article_url);
            strcat(info->url, formatted_title);
          }
          free(formatted_title);
          xmlFree(title);
        }
      }

      if (strcmp((const char *)cur_node->name, "thumbnail") == 0) {
        xmlChar *source = xmlGetProp(cur_node, (const xmlChar *)"source");
        if (source) {
          info->thumbnail_url = strdup((const char *)source);
          xmlFree(source);
        }
      }

      if (strcmp((const char *)cur_node->name, "extract") == 0) {
        xmlChar *content = xmlNodeGetContent(cur_node);
        if (content) {
          info->extract = strdup((const char *)content);

          shorten_summary(&(info->extract), 300);
          xmlFree(content);
        }
      }
    }
    /* Depth-first descent into the children of every sibling. */
    extract_wiki_info(cur_node->children, info);
  }
}
|
||||
|
||||
/*
 * Download the Wikipedia API response at `api_url` and parse it into an
 * InfoBox.  Returns an all-NULL InfoBox on any failure (curl init,
 * transfer, or XML parse).  Fields of the returned InfoBox are owned by
 * the caller (release with free_infobox).
 */
InfoBox fetch_wiki_data(char *api_url) {
  CURL *curl_handle;
  CURLcode res;
  struct WikiMemoryStruct chunk;
  InfoBox info = {NULL, NULL, NULL, NULL};

  /* malloc(1) seed so the write callback can always realloc. */
  chunk.memory = malloc(1);
  chunk.size = 0;

  curl_handle = curl_easy_init();

  if (curl_handle) {
    curl_easy_setopt(curl_handle, CURLOPT_URL, api_url);
    curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION,
                     WikiWriteMemoryCallback);
    curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
    /* Wikipedia's API policy requires an identifying User-Agent. */
    curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");

    res = curl_easy_perform(curl_handle);

    if (res == CURLE_OK) {
      xmlDocPtr doc =
          xmlReadMemory(chunk.memory, chunk.size, "noname.xml", NULL, 0);
      if (doc != NULL) {
        xmlNode *root_element = xmlDocGetRootElement(doc);
        extract_wiki_info(root_element, &info);
        xmlFreeDoc(doc);
      }
    }

    curl_easy_cleanup(curl_handle);
    free(chunk.memory);
  }

  return info;
}
|
||||
|
||||
char *construct_wiki_url(const char *search_term) {
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) return NULL;
|
||||
|
||||
char *escaped_term = curl_easy_escape(curl, search_term, 0);
|
||||
const char *base =
|
||||
"https://en.wikipedia.org/w/"
|
||||
"api.php?action=query&prop=extracts|pageimages&exintro&"
|
||||
"explaintext&pithumbsize=400&format=xml&origin=*&titles=";
|
||||
|
||||
char *full_url = malloc(strlen(base) + strlen(escaped_term) + 1);
|
||||
if (full_url) {
|
||||
strcpy(full_url, base);
|
||||
strcat(full_url, escaped_term);
|
||||
}
|
||||
|
||||
curl_free(escaped_term);
|
||||
curl_easy_cleanup(curl);
|
||||
return full_url;
|
||||
}
|
||||
9
src/Infobox/Wikipedia.h
Normal file
9
src/Infobox/Wikipedia.h
Normal file
@@ -0,0 +1,9 @@
|
||||
#ifndef WIKIPEDIA_H
|
||||
#define WIKIPEDIA_H
|
||||
|
||||
#include "Infobox.h"
|
||||
|
||||
InfoBox fetch_wiki_data(char *api_url);
|
||||
char *construct_wiki_url(const char *search_term);
|
||||
|
||||
#endif
|
||||
49
src/Main.c
Normal file
49
src/Main.c
Normal file
@@ -0,0 +1,49 @@
|
||||
#include <beaker.h>
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/parser.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include "Config.h"
|
||||
#include "Routes/Home.h"
|
||||
#include "Routes/Images.h"
|
||||
#include "Routes/Search.h"
|
||||
|
||||
/* GET /opensearch.xml — serve the OpenSearch descriptor so browsers can
 * register this instance as a search engine.  Params are unused. */
int handle_opensearch(UrlParams *params) {
  (void)params;
  serve_static_file_with_mime("opensearch.xml", "application/opensearchdescription+xml");
  return 0;
}
|
||||
|
||||
/*
 * Entry point: initialize libxml2 and libcurl globally, load the server
 * config (falling back to the compiled-in defaults on failure), register
 * the HTTP routes, and run the beaker server until it exits.
 */
int main() {
  LIBXML_TEST_VERSION
  xmlInitParser();

  /* Must run once before any per-handle curl use (handlers use curl). */
  curl_global_init(CURL_GLOBAL_DEFAULT);

  /* Defaults used when config.ini is absent or lacks a key. */
  Config config = {.host = "0.0.0.0", .port = 5000};

  if (load_config("config.ini", &config) != 0) {
    fprintf(stderr, "Warning: Could not load config file, using defaults\n");
  }

  set_handler("/", home_handler);
  set_handler("/opensearch.xml", handle_opensearch);
  set_handler("/search", results_handler);
  set_handler("/images", images_handler);

  fprintf(stderr, "Starting Omnisearch on %s:%d\n", config.host, config.port);

  /* Blocks for the lifetime of the server. */
  int result = beaker_run(config.host, config.port);

  if (result != 0) {
    fprintf(stderr, "Error: Beaker server failed to start.\n");
    curl_global_cleanup();
    xmlCleanupParser();
    return EXIT_FAILURE;
  }

  curl_global_cleanup();
  xmlCleanupParser();
  return EXIT_SUCCESS;
}
|
||||
14
src/Routes/Home.c
Normal file
14
src/Routes/Home.c
Normal file
@@ -0,0 +1,14 @@
|
||||
#include "Home.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
 * GET / — render and serve the home page template.
 *
 * Fix: render_template()'s result was previously passed to
 * send_response() unchecked; Images.c guards the same call against NULL,
 * so this handler now does too (consistent error handling).
 */
int home_handler(UrlParams *params) {
  (void)params;
  TemplateContext ctx = new_context();
  char *rendered_html = render_template("home.html", &ctx);
  if (rendered_html) {
    send_response(rendered_html);
    free(rendered_html);
  } else {
    send_response("<h1>Error rendering home page</h1>");
  }
  free_context(&ctx);

  return 0;
}
|
||||
8
src/Routes/Home.h
Normal file
8
src/Routes/Home.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef HOME_H
|
||||
#define HOME_H
|
||||
|
||||
#include <beaker.h>
|
||||
|
||||
int home_handler(UrlParams *params);
|
||||
|
||||
#endif
|
||||
278
src/Routes/Images.c
Normal file
278
src/Routes/Images.c
Normal file
@@ -0,0 +1,278 @@
|
||||
#include "Images.h"
|
||||
#include "../Utility/Unescape.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/xpath.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/* Accumulates the HTTP response body during a curl transfer. */
struct MemoryBlock {
  char *response;
  size_t size;
};

/*
 * libcurl write callback: grow the MemoryBlock and append `data`,
 * keeping the buffer NUL-terminated.  Returning 0 (on realloc failure)
 * signals libcurl to abort the transfer.
 */
static size_t ImageWriteCallback(void *data, size_t size, size_t nmemb,
                                 void *userp) {
  struct MemoryBlock *mem = (struct MemoryBlock *)userp;
  size_t incoming = size * nmemb;

  char *grown = realloc(mem->response, mem->size + incoming + 1);
  if (!grown) {
    return 0;
  }

  memcpy(grown + mem->size, data, incoming);
  mem->response = grown;
  mem->size += incoming;
  mem->response[mem->size] = 0;
  return incoming;
}
|
||||
|
||||
static char *fetch_images_html(const char *url) {
|
||||
CURL *curl_handle;
|
||||
struct MemoryBlock chunk = {.response = malloc(1), .size = 0};
|
||||
if (!chunk.response) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
curl_handle = curl_easy_init();
|
||||
if (!curl_handle) {
|
||||
free(chunk.response);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl_handle, CURLOPT_URL, url);
|
||||
curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, ImageWriteCallback);
|
||||
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
|
||||
curl_easy_setopt(
|
||||
curl_handle, CURLOPT_USERAGENT,
|
||||
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
|
||||
curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl_handle);
|
||||
if (res != CURLE_OK) {
|
||||
free(chunk.response);
|
||||
curl_easy_cleanup(curl_handle);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
curl_easy_cleanup(curl_handle);
|
||||
return chunk.response;
|
||||
}
|
||||
|
||||
/*
 * GET /images?q=… — scrape Bing image search results and render them
 * through the images.html template.  Collects up to 32 results, each as
 * a 4-string row: [thumbnail src, title/description, source-page href,
 * full-image href].
 *
 * Fixes: memory from curl_easy_escape() must be released with
 * curl_free(), not free() — three early-error paths used free() while
 * the success path used curl_free(); all now use curl_free().  The two
 * result-array allocations are also checked before use.
 */
int images_handler(UrlParams *params) {
  TemplateContext ctx = new_context();
  char *raw_query = "";

  /* Pull the "q" parameter out of the request, if present. */
  if (params) {
    for (int i = 0; i < params->count; i++) {
      if (strcmp(params->params[i].key, "q") == 0) {
        raw_query = params->params[i].value;
        break;
      }
    }
  }

  char *display_query = url_decode_query(raw_query);
  context_set(&ctx, "query", display_query);

  if (!raw_query || strlen(raw_query) == 0) {
    send_response("<h1>No query provided</h1>");
    if (display_query) free(display_query);
    free_context(&ctx);
    return -1;
  }

  /* Temporary handle used solely for URL-escaping the query. */
  CURL *tmp = curl_easy_init();
  if (!tmp) {
    send_response("<h1>Error initializing curl</h1>");
    if (display_query) free(display_query);
    free_context(&ctx);
    return -1;
  }
  char *encoded_query = curl_easy_escape(tmp, raw_query, 0);
  curl_easy_cleanup(tmp);

  if (!encoded_query) {
    send_response("<h1>Error encoding query</h1>");
    if (display_query) free(display_query);
    free_context(&ctx);
    return -1;
  }

  char url[1024];
  snprintf(url, sizeof(url),
           "https://www.bing.com/images/search?q=%s", encoded_query);
  fprintf(stderr, "[DEBUG] Fetching URL: %s\n", url);

  char *html = fetch_images_html(url);
  if (!html) {
    fprintf(stderr, "[DEBUG] Failed to fetch HTML\n");
    send_response("<h1>Error fetching images</h1>");
    curl_free(encoded_query); /* was free(): curl memory needs curl_free */
    free(display_query);
    free_context(&ctx);
    return -1;
  }

  htmlDocPtr doc = htmlReadMemory(html, (int)strlen(html), NULL, NULL,
                                  HTML_PARSE_RECOVER | HTML_PARSE_NOERROR);
  if (!doc) {
    free(html);
    curl_free(encoded_query);
    free(display_query);
    free_context(&ctx);
    return -1;
  }

  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);

  if (!xpathCtx) {
    xmlFreeDoc(doc);
    free(html);
    curl_free(encoded_query);
    free(display_query);
    free_context(&ctx);
    return -1;
  }

  /* Each result tile is a <div class="item"> in Bing's markup. */
  xmlXPathObjectPtr xpathObj =
      xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx);

  int image_count = 0;
  char ***image_matrix = NULL;
  int *inner_counts = NULL;

  if (xpathObj && xpathObj->nodesetval) {
    int nodes = xpathObj->nodesetval->nodeNr;
    fprintf(stderr, "[DEBUG] Found %d image items\n", nodes);

    int max_images = (nodes < 32) ? nodes : 32;
    image_matrix = malloc(sizeof(char **) * max_images);
    inner_counts = malloc(sizeof(int) * max_images);

    if (!image_matrix || !inner_counts) {
      /* OOM: render zero results instead of dereferencing NULL below. */
      free(image_matrix);
      free(inner_counts);
      image_matrix = NULL;
      inner_counts = NULL;
    } else {
      for (int i = 0; i < nodes; i++) {
        if (image_count >= 32) break;

        xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
        xmlNodePtr img_node = NULL;   /* <img> with the thumbnail src */
        xmlNodePtr tit_node = NULL;   /* link carrying title + href */
        xmlNodePtr des_node = NULL;   /* description div, preferred title */
        xmlNodePtr thumb_link = NULL; /* <a class="thumb"> full-image link */

        for (xmlNodePtr child = node->children; child; child = child->next) {
          if (child->type != XML_ELEMENT_NODE) continue;

          if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
            xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
            if (class) {
              if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) {
                thumb_link = child;
                /* Thumbnail <img> sits inside a <div class="cico">. */
                for (xmlNodePtr thumb_child = child->children; thumb_child; thumb_child = thumb_child->next) {
                  if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) {
                    xmlChar *div_class = xmlGetProp(thumb_child, (const xmlChar *)"class");
                    if (div_class && xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) {
                      for (xmlNodePtr cico_child = thumb_child->children; cico_child; cico_child = cico_child->next) {
                        if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") == 0) {
                          img_node = cico_child;
                          break;
                        }
                      }
                    }
                    if (div_class) xmlFree(div_class);
                  }
                }
              } else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) {
                tit_node = child;
              }
              xmlFree(class);
            }
          } else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) {
            xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
            if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) {
              /* Title/description may also live under <div class="meta">. */
              for (xmlNodePtr meta_child = child->children; meta_child; meta_child = meta_child->next) {
                if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) {
                  xmlChar *div_class = xmlGetProp(meta_child, (const xmlChar *)"class");
                  if (div_class) {
                    if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) {
                      des_node = meta_child;
                    }
                    xmlFree(div_class);
                  }
                } else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) {
                  xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class");
                  if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) {
                    tit_node = meta_child;
                  }
                  if (a_class) xmlFree(a_class);
                }
              }
            }
            if (class) xmlFree(class);
          }
        }

        xmlChar *iurl = img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
        xmlChar *full_url = thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
        xmlChar *title = des_node ? xmlNodeGetContent(des_node) : (tit_node ? xmlNodeGetContent(tit_node) : NULL);
        xmlChar *rurl = tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;

        fprintf(stderr, "[DEBUG] Image %d: thumb=%s, full=%s, title=%s, site=%s\n",
                image_count, iurl ? (char *)iurl : "nil",
                full_url ? (char *)full_url : "nil",
                title ? (char *)title : "nil",
                rurl ? (char *)rurl : "nil");

        /* A thumbnail URL is mandatory; everything else has fallbacks. */
        if (iurl && strlen((char *)iurl) > 0) {
          image_matrix[image_count] = malloc(sizeof(char *) * 4);
          image_matrix[image_count][0] = strdup((char *)iurl);
          image_matrix[image_count][1] = strdup(title ? (char *)title : "Image");
          image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#");
          image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#");
          inner_counts[image_count] = 4;
          image_count++;
        }

        if (iurl) xmlFree(iurl);
        if (title) xmlFree(title);
        if (rurl) xmlFree(rurl);
        if (full_url) xmlFree(full_url);
      }
    }
  }

  context_set_array_of_arrays(&ctx, "images", image_matrix, image_count,
                              inner_counts);

  char *rendered = render_template("images.html", &ctx);
  if (rendered) {
    send_response(rendered);
    free(rendered);
  } else {
    send_response("<h1>Error rendering image results</h1>");
  }

  if (image_matrix) {
    for (int i = 0; i < image_count; i++) {
      for (int j = 0; j < 4; j++) {
        free(image_matrix[i][j]);
      }
      free(image_matrix[i]);
    }
    free(image_matrix);
  }
  if (inner_counts) {
    free(inner_counts);
  }

  if (xpathObj) xmlXPathFreeObject(xpathObj);
  if (xpathCtx) xmlXPathFreeContext(xpathCtx);
  if (doc) xmlFreeDoc(doc);
  free(html);
  curl_free(encoded_query);
  free(display_query);
  free_context(&ctx);

  return 0;
}
|
||||
8
src/Routes/Images.h
Normal file
8
src/Routes/Images.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef IMAGES_HANDLER_H
#define IMAGES_HANDLER_H

#include <beaker.h>

/*
 * Route handler for the image-search results page.
 * params carries the parsed URL query parameters; returns 0 on success,
 * non-zero on error (see implementation in Images.c).
 */
int images_handler(UrlParams *params);

#endif
|
||||
275
src/Routes/Search.c
Normal file
275
src/Routes/Search.c
Normal file
@@ -0,0 +1,275 @@
|
||||
#include "Search.h"
|
||||
#include "../Infobox/Wikipedia.h"
|
||||
#include "../Infobox/Calculator.h"
|
||||
#include "../Infobox/Dictionary.h"
|
||||
#include "../Scraping/Scraping.h"
|
||||
#include "../Utility/Display.h"
|
||||
#include "../Utility/Unescape.h"
|
||||
#include <ctype.h>
|
||||
#include <pthread.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/*
 * Per-worker state for one infobox lookup (Wikipedia / calculator /
 * dictionary). One instance is handed to each pthread; the thread fills
 * `result` and sets `success` when the fetched data is usable.
 */
typedef struct {
  const char *query;  /* search query text; borrowed, not owned */
  InfoBox result;     /* filled in by the worker thread */
  int success;        /* nonzero when result's title/extract are valid */
} InfoBoxThreadData;
|
||||
|
||||
static void *wiki_thread_func(void *arg) {
|
||||
InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
|
||||
char *dynamic_url = construct_wiki_url(data->query);
|
||||
if (dynamic_url) {
|
||||
data->result = fetch_wiki_data(dynamic_url);
|
||||
data->success =
|
||||
(data->result.title != NULL && data->result.extract != NULL &&
|
||||
strlen(data->result.extract) > 10);
|
||||
free(dynamic_url);
|
||||
} else {
|
||||
data->success = 0;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Heuristic: does the query look like an arithmetic expression?
 * Returns 1 when the text contains at least one digit (or '.') combined
 * with an arithmetic operator, or a digit together with a decimal point;
 * 0 otherwise (including NULL input).
 */
static int is_calculator_query(const char *query) {
  if (!query) return 0;

  int has_digit = 0;
  int has_operator = 0;

  for (const char *p = query; *p; p++) {
    /* Cast to unsigned char: passing a negative plain-char value to
     * isdigit() is undefined behavior. */
    if (isdigit((unsigned char)*p) || *p == '.') {
      has_digit = 1;
    }
    if (*p == '+' || *p == '-' || *p == '*' || *p == '/' || *p == '=' ||
        *p == '^') {
      has_operator = 1;
    }
  }

  return has_digit && (has_operator || strchr(query, '.'));
}
|
||||
|
||||
static void *calc_thread_func(void *arg) {
|
||||
InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
|
||||
|
||||
if (is_calculator_query(data->query)) {
|
||||
data->result = fetch_calc_data((char *)data->query);
|
||||
data->success =
|
||||
(data->result.title != NULL && data->result.extract != NULL);
|
||||
} else {
|
||||
data->success = 0;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void *dict_thread_func(void *arg) {
|
||||
InfoBoxThreadData *data = (InfoBoxThreadData *)arg;
|
||||
|
||||
if (is_dictionary_query(data->query)) {
|
||||
data->result = fetch_dictionary_data(data->query);
|
||||
data->success =
|
||||
(data->result.title != NULL && data->result.extract != NULL);
|
||||
} else {
|
||||
data->success = 0;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Append one infobox's fields as a 4-entry row (title, thumbnail_url,
 * extract, url) to a growable array-of-rows.
 *
 * Ownership: row cells alias the InfoBox's own strings — the caller frees
 * the strings via free_infobox() and the row/outer arrays separately.
 *
 * Returns the new element count, or the unchanged count if any allocation
 * fails (previous fix: `p = realloc(p, …)` lost the old pointer — and
 * leaked it — on failure; allocations were also unchecked).
 */
static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
                                     int **inner_counts, int current_count) {
  char ***grown = realloc(*collection, sizeof(char **) * (current_count + 1));
  if (!grown) return current_count;
  *collection = grown;

  int *grown_counts = realloc(*inner_counts, sizeof(int) * (current_count + 1));
  if (!grown_counts) return current_count;
  *inner_counts = grown_counts;

  char **row = malloc(sizeof(char *) * 4);
  if (!row) return current_count;

  row[0] = infobox->title;
  row[1] = infobox->thumbnail_url;
  row[2] = infobox->extract;
  row[3] = infobox->url;

  (*collection)[current_count] = row;
  (*inner_counts)[current_count] = 4;

  return current_count + 1;
}
|
||||
|
||||
/*
 * Route handler for the main search-results page.
 *
 * Reads "q" (query) and "p" (page) from the URL, scrapes all registered
 * engines in parallel, deduplicates results by URL, and renders
 * results.html. On page 1 it additionally runs three infobox workers
 * (Wikipedia, calculator, dictionary) on background threads.
 *
 * Returns 0 on success, -1 when no query was supplied.
 */
int results_handler(UrlParams *params) {
  TemplateContext ctx = new_context();
  char *raw_query = "";
  int page = 1;

  /* Extract "q" and "p" from the request; page is clamped to >= 1. */
  if (params) {
    for (int i = 0; i < params->count; i++) {
      if (strcmp(params->params[i].key, "q") == 0) {
        raw_query = params->params[i].value;
      } else if (strcmp(params->params[i].key, "p") == 0) {
        int parsed = atoi(params->params[i].value);
        if (parsed > 1) page = parsed;
      }
    }
  }

  context_set(&ctx, "query", raw_query);

  /* Pagination strings for the template (prev_page is 0 on page 1). */
  char page_str[16], prev_str[16], next_str[16];
  snprintf(page_str, sizeof(page_str), "%d", page);
  snprintf(prev_str, sizeof(prev_str), "%d", page > 1 ? page - 1 : 0);
  snprintf(next_str, sizeof(next_str), "%d", page + 1);
  context_set(&ctx, "page", page_str);
  context_set(&ctx, "prev_page", prev_str);
  context_set(&ctx, "next_page", next_str);

  if (!raw_query || strlen(raw_query) == 0) {
    send_response("<h1>No query provided</h1>");
    free_context(&ctx);
    return -1;
  }

  /* Infobox workers run only on the first page of results. */
  pthread_t wiki_tid, calc_tid, dict_tid;
  InfoBoxThreadData wiki_data = {.query = raw_query, .success = 0};
  InfoBoxThreadData calc_data = {.query = raw_query, .success = 0};
  InfoBoxThreadData dict_data = {.query = raw_query, .success = 0};

  if (page == 1) {
    pthread_create(&wiki_tid, NULL, wiki_thread_func, &wiki_data);
    pthread_create(&calc_tid, NULL, calc_thread_func, &calc_data);
    pthread_create(&dict_tid, NULL, dict_thread_func, &dict_data);
  }

  /* One scrape job per registered engine; all run in parallel via curl. */
  ScrapeJob jobs[ENGINE_COUNT];
  SearchResult *all_results[ENGINE_COUNT];

  for (int i = 0; i < ENGINE_COUNT; i++) {
    all_results[i] = NULL;
    jobs[i].engine = &ENGINE_REGISTRY[i];
    jobs[i].query = raw_query;
    jobs[i].out_results = &all_results[i];
    jobs[i].max_results = 10;
    jobs[i].results_count = 0;
    jobs[i].page = page;
  }

  scrape_engines_parallel(jobs, ENGINE_COUNT);

  if (page == 1) {
    pthread_join(wiki_tid, NULL);
    pthread_join(calc_tid, NULL);
    pthread_join(dict_tid, NULL);
  }

  /* Collect successful infoboxes: dictionary first, then calculator,
   * then Wikipedia. Rows alias InfoBox-owned strings. */
  char ***infobox_matrix = NULL;
  int *infobox_inner_counts = NULL;
  int infobox_count = 0;

  if (page == 1) {
    if (dict_data.success) {
      infobox_count = add_infobox_to_collection(&dict_data.result, &infobox_matrix,
                                                &infobox_inner_counts, infobox_count);
    }
    if (calc_data.success) {
      infobox_count = add_infobox_to_collection(&calc_data.result, &infobox_matrix,
                                                &infobox_inner_counts, infobox_count);
    }
    if (wiki_data.success) {
      infobox_count = add_infobox_to_collection(&wiki_data.result, &infobox_matrix,
                                                &infobox_inner_counts, infobox_count);
    }
  }

  if (infobox_count > 0) {
    context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix,
                                infobox_count, infobox_inner_counts);
  }

  int total_results = 0;
  for (int i = 0; i < ENGINE_COUNT; i++) {
    total_results += jobs[i].results_count;
  }

  if (total_results > 0) {
    char ***results_matrix = (char ***)malloc(sizeof(char **) * total_results);
    int *results_inner_counts = (int *)malloc(sizeof(int) * total_results);
    char **seen_urls = (char **)malloc(sizeof(char *) * total_results);
    int unique_count = 0;

    /* Merge all engines' results, dropping exact-URL duplicates.
     * Each kept row is [url, pretty_url, title, snippet], all owned here. */
    for (int i = 0; i < ENGINE_COUNT; i++) {
      for (int j = 0; j < jobs[i].results_count; j++) {
        char *display_url = all_results[i][j].url;

        int is_duplicate = 0;
        for (int k = 0; k < unique_count; k++) {
          if (strcmp(seen_urls[k], display_url) == 0) {
            is_duplicate = 1;
            break;
          }
        }

        if (is_duplicate) {
          free(all_results[i][j].url);
          free(all_results[i][j].title);
          free(all_results[i][j].snippet);
          continue;
        }

        seen_urls[unique_count] = strdup(display_url);
        results_matrix[unique_count] = (char **)malloc(sizeof(char *) * 4);
        char *pretty_url = pretty_display_url(display_url);

        results_matrix[unique_count][0] = strdup(display_url);
        results_matrix[unique_count][1] = strdup(pretty_url);
        results_matrix[unique_count][2] = all_results[i][j].title ? strdup(all_results[i][j].title) : strdup("Untitled");
        results_matrix[unique_count][3] = all_results[i][j].snippet ? strdup(all_results[i][j].snippet) : strdup("");

        results_inner_counts[unique_count] = 4;

        free(pretty_url);
        free(all_results[i][j].url);
        free(all_results[i][j].title);
        free(all_results[i][j].snippet);

        unique_count++;
      }
      free(all_results[i]);
    }

    context_set_array_of_arrays(&ctx, "results", results_matrix, unique_count, results_inner_counts);

    char *html = render_template("results.html", &ctx);
    if (html) {
      send_response(html);
      free(html);
    }

    for (int i = 0; i < unique_count; i++) {
      for (int j = 0; j < 4; j++) free(results_matrix[i][j]);
      free(results_matrix[i]);
      free(seen_urls[i]);
    }
    free(seen_urls);
    free(results_matrix);
    free(results_inner_counts);
  } else {
    char *html = render_template("results.html", &ctx);
    if (html) {
      send_response(html);
      free(html);
    }
  }

  /* BUG FIX: the per-infobox rows allocated by add_infobox_to_collection
   * were never freed (only the outer array and counts were). Free the
   * whole structure here, after rendering, mirroring the results_matrix
   * cleanup pattern. The strings inside the rows are owned by the
   * InfoBoxes and released via free_infobox below. */
  if (infobox_count > 0) {
    for (int i = 0; i < infobox_count; i++) {
      free(infobox_matrix[i]);
    }
    free(infobox_matrix);
    free(infobox_inner_counts);
  }

  if (page == 1) {
    if (wiki_data.success) free_infobox(&wiki_data.result);
    if (calc_data.success) free_infobox(&calc_data.result);
    if (dict_data.success) free_infobox(&dict_data.result);
  }
  free_context(&ctx);

  return 0;
}
|
||||
8
src/Routes/Search.h
Normal file
8
src/Routes/Search.h
Normal file
@@ -0,0 +1,8 @@
|
||||
#ifndef SEARCH_HANDLER_H
#define SEARCH_HANDLER_H

#include <beaker.h>

/*
 * Route handler for the text search-results page.
 * params carries the parsed URL query parameters; returns 0 on success,
 * -1 when no query was supplied (see implementation in Search.c).
 */
int results_handler(UrlParams *params);

#endif
|
||||
459
src/Scraping/Scraping.c
Normal file
459
src/Scraping/Scraping.c
Normal file
@@ -0,0 +1,459 @@
|
||||
#include "Scraping.h"
|
||||
#include "../Utility/Unescape.h"
|
||||
#include <curl/curl.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/xpath.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
|
||||
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb,
|
||||
void *userp) {
|
||||
size_t realsize = size * nmemb;
|
||||
MemoryBuffer *mem = (MemoryBuffer *)userp;
|
||||
|
||||
if (mem->size + realsize + 1 > mem->capacity) {
|
||||
|
||||
size_t new_cap = mem->capacity == 0 ? 16384 : mem->capacity * 2;
|
||||
while (new_cap < mem->size + realsize + 1) new_cap *= 2;
|
||||
|
||||
char *ptr = (char *)realloc(mem->memory, new_cap);
|
||||
if (!ptr) {
|
||||
return 0;
|
||||
}
|
||||
mem->memory = ptr;
|
||||
mem->capacity = new_cap;
|
||||
}
|
||||
|
||||
memcpy(&(mem->memory[mem->size]), contents, realsize);
|
||||
mem->size += realsize;
|
||||
mem->memory[mem->size] = 0;
|
||||
|
||||
return realsize;
|
||||
}
|
||||
|
||||
/*
 * Return a randomly selected desktop-browser User-Agent string.
 * Rotating the UA makes the scraper's requests look less uniform.
 * The returned pointer is to static storage; do not free it.
 */
static const char *get_random_user_agent(void) {
  static const char *agents[] = {
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
      "like Gecko) Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
      /* BUG FIX: a stray backtick ("Chrome/120.0.0.0`") corrupted this UA
       * string, making it trivially fingerprintable as fake. */
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like "
      "Gecko) "
      "Chrome/120.0.0.0 Safari/537.36",
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 "
      "Firefox/121.0",
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 "
      "(KHTML, like Gecko) Version/17.2 Safari/605.1.15"};
  /* Derive the count from the array so adding an entry cannot go stale
   * (was a hard-coded "% 5"). */
  return agents[rand() % (sizeof(agents) / sizeof(agents[0]))];
}
|
||||
|
||||
/*
 * Parser for DuckDuckGo Lite result pages (table-based layout).
 *
 * Selects every non-sponsored result link, then walks up to the link's
 * enclosing <tr> and scans the following sibling rows for the snippet
 * cell. Fills *out_results (calloc'd here, owned by the caller) and
 * returns the number of results written, 0 on any failure.
 */
static int parse_ddg_lite(const char *engine_name, xmlDocPtr doc,
                          SearchResult **out_results, int max_results) {
  (void)engine_name;
  int found_count = 0;
  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  if (!xpathCtx) {
    return 0;
  }

  /* Result anchors, excluding rows DDG marks as sponsored. */
  const char *link_xpath = "//tr[not(contains(@class, 'result-sponsored'))]//a[@class='result-link']";
  xmlXPathObjectPtr xpathObj =
      xmlXPathEvalExpression((xmlChar *)link_xpath, xpathCtx);

  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
    if (xpathObj) xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  int num_links = xpathObj->nodesetval->nodeNr;

  /* Allocate exactly min(num_links, max_results) slots; the loop below
   * increments found_count every iteration, so it never overruns this. */
  int actual_alloc = (num_links < max_results) ? num_links : max_results;
  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  if (!*out_results) {
    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  for (int i = 0; i < num_links && found_count < max_results; i++) {
    xmlNodePtr linkNode = xpathObj->nodesetval->nodeTab[i];
    char *title = (char *)xmlNodeGetContent(linkNode);
    char *url = (char *)xmlGetProp(linkNode, (xmlChar *)"href");
    char *snippet_text = NULL;

    /* Climb to the <tr> that contains this link. */
    xmlNodePtr current = linkNode->parent;
    while (current && xmlStrcasecmp(current->name, (const xmlChar *)"tr") != 0)
      current = current->parent;

    /* The snippet lives in the next <tr>, in a result-snippet cell. */
    if (current && current->next) {
      xmlNodePtr snippetRow = current->next;
      while (snippetRow &&
             xmlStrcasecmp(snippetRow->name, (const xmlChar *)"tr") != 0)
        snippetRow = snippetRow->next;
      if (snippetRow) {
        /* Temporarily re-root the XPath context at the snippet row. */
        xpathCtx->node = snippetRow;
        xmlXPathObjectPtr sObj = xmlXPathEvalExpression(
            (xmlChar *)".//td[@class='result-snippet']", xpathCtx);
        if (sObj && sObj->nodesetval && sObj->nodesetval->nodeNr > 0) {
          snippet_text = (char *)xmlNodeGetContent(sObj->nodesetval->nodeTab[0]);
        }
        if (sObj) xmlXPathFreeObject(sObj);
        xpathCtx->node = NULL;
      }
    }

    /* DDG hrefs are redirect URLs; unescape_search_url extracts and
     * percent-decodes the real destination (may return NULL for NULL url). */
    (*out_results)[found_count].url = unescape_search_url(url);
    (*out_results)[found_count].title = strdup(title ? title : "No Title");
    (*out_results)[found_count].snippet = strdup(snippet_text ? snippet_text : "");

    found_count++;

    if (title) xmlFree(title);
    if (url) xmlFree(url);
    if (snippet_text) xmlFree(snippet_text);
  }

  xmlXPathFreeObject(xpathObj);
  xmlXPathFreeContext(xpathCtx);
  return found_count;
}
|
||||
|
||||
/*
 * Parser for Startpage result pages (div-based layout).
 *
 * Iterates over result containers and pulls the link, title, and
 * description out of each via relative XPath queries. Fills *out_results
 * (calloc'd here, owned by the caller) and returns the number of results
 * written, 0 on any failure. Entries missing a url or title are skipped.
 */
static int parse_startpage(const char *engine_name, xmlDocPtr doc,
                           SearchResult **out_results, int max_results) {
  (void)engine_name;
  int found_count = 0;
  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  if (!xpathCtx) {
    return 0;
  }

  const char *container_xpath = "//div[contains(@class, 'result')]";
  xmlXPathObjectPtr xpathObj =
      xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);

  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
    if (xpathObj) xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  int num_results = xpathObj->nodesetval->nodeNr;

  int actual_alloc = (num_results < max_results) ? num_results : max_results;
  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  if (!*out_results) {
    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  for (int i = 0; i < num_results && found_count < max_results; i++) {
    xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
    /* Re-root relative queries at this result container. */
    xpathCtx->node = resultNode;

    xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
        (xmlChar *)".//a[contains(@class, 'result-link')]", xpathCtx);
    char *url =
        (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
                                 (xmlChar *)"href")
            : NULL;

    xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
        (xmlChar *)".//h2[contains(@class, 'wgl-title')]", xpathCtx);
    char *title =
        (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
            : NULL;

    xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
        (xmlChar *)".//p[contains(@class, 'description')]", xpathCtx);
    char *snippet_text =
        (snippetObj && snippetObj->nodesetval &&
         snippetObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
            : NULL;

    /* Keep the entry only when both the url and the title were found;
     * Startpage links are direct, so no redirect unescaping is needed. */
    if (url && title) {
      (*out_results)[found_count].url = strdup(url);
      (*out_results)[found_count].title = strdup(title);
      (*out_results)[found_count].snippet =
          strdup(snippet_text ? snippet_text : "");
      found_count++;
    }

    if (title) xmlFree(title);
    if (url) xmlFree(url);
    if (snippet_text) xmlFree(snippet_text);
    if (linkObj) xmlXPathFreeObject(linkObj);
    if (titleObj) xmlXPathFreeObject(titleObj);
    if (snippetObj) xmlXPathFreeObject(snippetObj);
  }

  xpathCtx->node = NULL;

  xmlXPathFreeObject(xpathObj);
  xmlXPathFreeContext(xpathCtx);
  return found_count;
}
|
||||
|
||||
/*
 * Parser for Yahoo result pages.
 *
 * Iterates over "algo-sr" result containers and extracts link, title, and
 * snippet via relative XPath queries. Yahoo hrefs are redirect URLs
 * (containing "RU="), so unescape_search_url recovers the real target.
 * Fills *out_results (calloc'd here, owned by the caller) and returns the
 * number of results written, 0 on any failure.
 */
static int parse_yahoo(const char *engine_name, xmlDocPtr doc,
                       SearchResult **out_results, int max_results) {
  (void)engine_name;
  int found_count = 0;
  xmlXPathContextPtr xpathCtx = xmlXPathNewContext(doc);
  if (!xpathCtx) {
    return 0;
  }

  const char *container_xpath = "//div[contains(@class, 'algo-sr')]";
  xmlXPathObjectPtr xpathObj =
      xmlXPathEvalExpression((xmlChar *)container_xpath, xpathCtx);

  if (!xpathObj || !xpathObj->nodesetval || xpathObj->nodesetval->nodeNr == 0) {
    if (xpathObj) xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  int num_results = xpathObj->nodesetval->nodeNr;

  int actual_alloc = (num_results < max_results) ? num_results : max_results;
  *out_results = (SearchResult *)calloc(actual_alloc, sizeof(SearchResult));
  if (!*out_results) {
    xmlXPathFreeObject(xpathObj);
    xmlXPathFreeContext(xpathCtx);
    return 0;
  }

  for (int i = 0; i < num_results && found_count < max_results; i++) {
    xmlNodePtr resultNode = xpathObj->nodesetval->nodeTab[i];
    /* Re-root relative queries at this result container. */
    xpathCtx->node = resultNode;

    xmlXPathObjectPtr linkObj = xmlXPathEvalExpression(
        (xmlChar *)".//div[contains(@class, 'compTitle')]//a[@target='_blank']",
        xpathCtx);
    char *url =
        (linkObj && linkObj->nodesetval && linkObj->nodesetval->nodeNr > 0)
            ? (char *)xmlGetProp(linkObj->nodesetval->nodeTab[0],
                                 (xmlChar *)"href")
            : NULL;

    xmlXPathObjectPtr titleObj = xmlXPathEvalExpression(
        (xmlChar *)".//h3[contains(@class, 'title')]", xpathCtx);
    char *title =
        (titleObj && titleObj->nodesetval && titleObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(titleObj->nodesetval->nodeTab[0])
            : NULL;

    xmlXPathObjectPtr snippetObj = xmlXPathEvalExpression(
        (xmlChar *)".//div[contains(@class, 'compText')]//p", xpathCtx);
    char *snippet_text =
        (snippetObj && snippetObj->nodesetval &&
         snippetObj->nodesetval->nodeNr > 0)
            ? (char *)xmlNodeGetContent(snippetObj->nodesetval->nodeTab[0])
            : NULL;

    if (url && title) {
      /* Decode the real destination out of Yahoo's redirect URL. */
      (*out_results)[found_count].url = unescape_search_url(url);
      (*out_results)[found_count].title = strdup(title);
      (*out_results)[found_count].snippet =
          strdup(snippet_text ? snippet_text : "");
      found_count++;
    }

    if (title) xmlFree(title);
    if (url) xmlFree(url);
    if (snippet_text) xmlFree(snippet_text);
    if (linkObj) xmlXPathFreeObject(linkObj);
    if (titleObj) xmlXPathFreeObject(titleObj);
    if (snippetObj) xmlXPathFreeObject(snippetObj);
  }

  xpathCtx->node = NULL;
  xmlXPathFreeObject(xpathObj);
  xmlXPathFreeContext(xpathCtx);
  return found_count;
}
|
||||
|
||||
/*
 * Registry of scrapeable search engines. For each engine, the results page
 * for page N is requested as:
 *   base_url + urlencode(query) + "&" + page_param + "=" +
 *   ((N - 1) * page_multiplier + page_base)
 * (see scrape_engines_parallel). `parser` converts the fetched HTML into
 * SearchResult entries.
 */
const SearchEngine ENGINE_REGISTRY[] = {
    /* DDG Lite: "s" is a result offset, 30 results per page. */
    {.name = "DuckDuckGo Lite",
     .base_url = "https://lite.duckduckgo.com/lite/?q=",
     .host_header = "lite.duckduckgo.com",
     .referer = "https://lite.duckduckgo.com/",
     .page_param = "s",
     .page_multiplier = 30,
     .page_base = 0,
     .parser = parse_ddg_lite},
    /* Startpage: "page" is a 1-based page number. */
    {.name = "Startpage",
     .base_url = "https://www.startpage.com/sp/search?q=",
     .host_header = "www.startpage.com",
     .referer = "https://www.startpage.com/",
     .page_param = "page",
     .page_multiplier = 1,
     .page_base = 1,
     .parser = parse_startpage},
    /* Yahoo: "b" is a 1-based result offset, 10 results per page. */
    {.name = "Yahoo",
     .base_url = "https://search.yahoo.com/search?p=",
     .host_header = "search.yahoo.com",
     .referer = "https://search.yahoo.com/",
     .page_param = "b",
     .page_multiplier = 10,
     .page_base = 1,
     .parser = parse_yahoo}};

/* Number of entries in ENGINE_REGISTRY. */
const int ENGINE_COUNT = sizeof(ENGINE_REGISTRY) / sizeof(SearchEngine);
|
||||
|
||||
/*
 * Apply the common transfer options to one curl easy handle: target URL,
 * request headers, response sink, and browser-like behavior (HTTP/2,
 * compression, cookies, redirects). `chunk` must outlive the transfer —
 * WriteMemoryCallback appends response bytes into it.
 */
static void configure_curl_handle(CURL *curl, const char *full_url,
                                  MemoryBuffer *chunk,
                                  struct curl_slist *headers) {
  curl_easy_setopt(curl, CURLOPT_URL, full_url);
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)chunk);
  /* Rotate the UA per request to look less like a bot. */
  curl_easy_setopt(curl, CURLOPT_USERAGENT, get_random_user_agent());

  curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);

  /* Empty string = accept all encodings curl supports (gzip etc.). */
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");

  curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, 300L);

  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
  curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
  /* Empty cookie file enables the in-memory cookie engine. */
  curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
}
|
||||
|
||||
/*
 * Run all scrape jobs concurrently through a single curl multi handle.
 *
 * For each job: build the paginated search URL, attach engine-specific
 * headers, and add the transfer to the multi handle. After all transfers
 * complete, each successful response is parsed with the engine's parser
 * and job->results_count is set. Jobs whose setup fails are skipped; the
 * caller is expected to have initialized results_count to 0.
 *
 * Returns 0 on success, -1 if the multi handle could not be created.
 */
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
  CURLM *multi_handle = curl_multi_init();
  if (!multi_handle) {
    return -1;
  }

  for (int i = 0; i < num_jobs; i++) {
    ScrapeJob *job = &jobs[i];
    job->handle = curl_easy_init();
    if (!job->handle) {
      continue; /* skip this engine; results_count stays 0 */
    }

    char *encoded_query = curl_easy_escape(job->handle, job->query, 0);
    if (!encoded_query) {
      curl_easy_cleanup(job->handle);
      job->handle = NULL;
      continue;
    }

    /* BUG FIX: the response buffer used to be allocated before the query
     * was encoded, leaking 16 KiB whenever encoding failed; it was also
     * never checked. Allocate it only once setup can still succeed. */
    job->response.memory = (char *)malloc(16384);
    if (!job->response.memory) {
      curl_free(encoded_query);
      curl_easy_cleanup(job->handle);
      job->handle = NULL;
      continue;
    }
    job->response.size = 0;
    job->response.capacity = 16384;

    /* Page number -> engine-specific offset/page value. */
    int page = (job->page < 1) ? 1 : job->page;
    int page_value = (page - 1) * job->engine->page_multiplier + job->engine->page_base;

    char full_url[1024];
    snprintf(full_url, sizeof(full_url), "%s%s&%s=%d",
             job->engine->base_url,
             encoded_query,
             job->engine->page_param,
             page_value);
    curl_free(encoded_query);

    struct curl_slist *headers = NULL;
    char host_buf[256], ref_buf[256];
    snprintf(host_buf, sizeof(host_buf), "Host: %s", job->engine->host_header);
    snprintf(ref_buf, sizeof(ref_buf), "Referer: %s", job->engine->referer);
    headers = curl_slist_append(headers, host_buf);
    headers = curl_slist_append(headers, ref_buf);
    headers = curl_slist_append(headers, "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
    headers = curl_slist_append(headers, "DNT: 1");

    configure_curl_handle(job->handle, full_url, &job->response, headers);

    /* Stash the header list on the handle so it can be freed after the
     * transfer completes. */
    curl_easy_setopt(job->handle, CURLOPT_PRIVATE, headers);

    curl_multi_add_handle(multi_handle, job->handle);
  }

  /* Small random start delay (100-200 ms) to avoid a perfectly regular
   * request cadence. */
  usleep(100000 + (rand() % 100000));

  int still_running = 0;
  curl_multi_perform(multi_handle, &still_running);

  /* Drive all transfers to completion. */
  do {
    int numfds = 0;
    CURLMcode mc = curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
    if (mc != CURLM_OK) {
      break;
    }
    curl_multi_perform(multi_handle, &still_running);
  } while (still_running);

  /* Collect completed transfers: parse successful responses, then release
   * per-job resources (headers, buffer, easy handle). */
  CURLMsg *msg;
  int msgs_left;
  while ((msg = curl_multi_info_read(multi_handle, &msgs_left))) {
    if (msg->msg != CURLMSG_DONE) {
      continue;
    }
    CURL *handle = msg->easy_handle;

    for (int i = 0; i < num_jobs; i++) {
      if (jobs[i].handle != handle) {
        continue;
      }
      ScrapeJob *job = &jobs[i];

      long response_code;
      curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &response_code);
      (void)response_code; /* fetched for debugging; not otherwise used */

      if (msg->data.result == CURLE_OK && job->response.size > 0) {
        xmlDocPtr doc = htmlReadMemory(
            job->response.memory, job->response.size, NULL, NULL,
            HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);

        if (doc) {
          job->results_count = job->engine->parser(
              job->engine->name, doc, job->out_results, job->max_results);
          xmlFreeDoc(doc);
        }
      } else {
        job->results_count = 0;
      }

      struct curl_slist *headers;
      curl_easy_getinfo(handle, CURLINFO_PRIVATE, &headers);
      if (headers) curl_slist_free_all(headers);

      free(job->response.memory);
      curl_multi_remove_handle(multi_handle, handle);
      curl_easy_cleanup(handle);
      break;
    }
  }

  curl_multi_cleanup(multi_handle);
  return 0;
}
|
||||
|
||||
int scrape_engine(const SearchEngine *engine, const char *query,
|
||||
SearchResult **out_results, int max_results) {
|
||||
ScrapeJob job = {
|
||||
.engine = engine,
|
||||
.query = (char *)query,
|
||||
.out_results = out_results,
|
||||
.max_results = max_results,
|
||||
.results_count = 0,
|
||||
.page = 1
|
||||
};
|
||||
|
||||
scrape_engines_parallel(&job, 1);
|
||||
return job.results_count;
|
||||
}
|
||||
58
src/Scraping/Scraping.h
Normal file
58
src/Scraping/Scraping.h
Normal file
@@ -0,0 +1,58 @@
|
||||
#ifndef SCRAPING_H
#define SCRAPING_H

#include <libxml/HTMLparser.h>
#include <curl/curl.h>

/* Leveled logging helpers; all output goes to stderr. */
#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__)
#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__)

/* One parsed search hit. All three strings are heap-allocated and owned
 * by the consumer of the results array. */
typedef struct {
  char *url;
  char *title;
  char *snippet;
} SearchResult;

/* Converts a fetched HTML document into SearchResult entries.
 * Allocates *out_results and returns the number of entries written. */
typedef int (*ParserFunc)(const char *engine_name, xmlDocPtr doc,
                          SearchResult **out_results, int max_results);

/* Static description of one scrapeable search engine. The request URL for
 * page N is base_url + urlencode(query) + "&" + page_param + "=" +
 * ((N - 1) * page_multiplier + page_base). */
typedef struct {
  const char *name;
  const char *base_url;
  const char *host_header;  /* value for the Host: request header */
  const char *referer;      /* value for the Referer: request header */

  const char *page_param;   /* query parameter carrying the page/offset */
  int page_multiplier;      /* results-per-page (or 1 for page numbers) */
  int page_base;            /* value of page_param on page 1 */
  ParserFunc parser;        /* HTML -> SearchResult converter */
} SearchEngine;

/* Growable byte buffer used as the curl response sink; contents are kept
 * NUL-terminated by the write callback. */
typedef struct {
  char *memory;
  size_t size;
  size_t capacity;
} MemoryBuffer;

/* One engine fetch+parse unit for scrape_engines_parallel. The caller
 * fills engine/query/out_results/max_results/page and zeroes
 * results_count; handle and response are managed internally. */
typedef struct {
  const SearchEngine *engine;
  char *query;
  SearchResult **out_results;
  int max_results;
  int page;
  CURL *handle;
  MemoryBuffer response;
  int results_count;
} ScrapeJob;

/* Table of supported engines and its length (defined in Scraping.c). */
extern const SearchEngine ENGINE_REGISTRY[];
extern const int ENGINE_COUNT;

/* Scrape a single engine (page 1); returns the number of results. */
int scrape_engine(const SearchEngine *engine, const char *query,
                  SearchResult **out_results, int max_results);

/* Run all jobs concurrently; returns 0 on success, -1 on setup failure. */
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs);

#endif
|
||||
46
src/Utility/Display.c
Normal file
46
src/Utility/Display.c
Normal file
@@ -0,0 +1,46 @@
|
||||
#include "Display.h"
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
|
||||
/*
 * Produce a human-friendly display form of a URL:
 *   "https://www.Example.com/Foo/" -> "example.com > foo"
 * Strips the scheme and a leading "www.", drops a trailing slash,
 * lowercases everything, and renders path separators as " > ".
 *
 * Input longer than 511 chars (after scheme stripping) is truncated.
 * Returns a heap-allocated string the caller must free, or NULL on
 * NULL input / allocation failure.
 */
char *pretty_display_url(const char *input) {
  if (!input) return NULL;

  const char *start = input;

  /* Skip "scheme://" if present. */
  const char *protocol_pos = strstr(input, "://");
  if (protocol_pos) {
    start = protocol_pos + 3;
  }

  if (strncasecmp(start, "www.", 4) == 0) {
    start += 4;
  }

  /* Bounded copy; snprintf always NUL-terminates. */
  char temp[512];
  snprintf(temp, sizeof(temp), "%s", start);

  /* BUG FIX: the length must be measured AFTER truncation. The old code
   * used strlen(start), so inputs longer than the buffer indexed
   * temp[input_len - 1] out of bounds. */
  size_t len = strlen(temp);
  if (len > 0 && temp[len - 1] == '/') {
    temp[--len] = '\0';
  }

  /* Worst case every char is '/', which expands to " > " (3 chars). */
  char *output = (char *)malloc(len * 3 + 1);
  if (!output) return NULL;

  size_t j = 0;
  for (size_t i = 0; temp[i] != '\0'; i++) {
    if (temp[i] == '/') {
      output[j++] = ' ';
      output[j++] = '>';
      output[j++] = ' ';
    } else {
      output[j++] = (char)tolower((unsigned char)temp[i]);
    }
  }
  output[j] = '\0';

  return output;
}
|
||||
6
src/Utility/Display.h
Normal file
6
src/Utility/Display.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef DISPLAY_H
#define DISPLAY_H

/*
 * Turn a URL into a human-friendly display string (scheme and "www."
 * stripped, lowercased, path separators shown as " > ").
 * Returns a heap-allocated string the caller must free, or NULL.
 */
char *pretty_display_url(const char *input);

#endif
|
||||
80
src/Utility/Unescape.c
Normal file
80
src/Utility/Unescape.c
Normal file
@@ -0,0 +1,80 @@
|
||||
#include "Unescape.h"
|
||||
#include "Utility.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Recover the real destination URL from a search-engine redirect link.
 *
 * Handles two redirect formats:
 *   - DuckDuckGo: the target is the percent-encoded "uddg=" parameter,
 *     terminated by '&'.
 *   - Yahoo: the target is the percent-encoded "RU=" segment, terminated
 *     by '/'.
 * Any other input is returned as a plain copy. The extracted span is
 *percent-decoded ('+' becomes a space). Returns a heap-allocated string
 * the caller must free, or NULL on NULL input / allocation failure.
 */
char *unescape_search_url(const char *input) {
  if (!input) return NULL;

  const char *span;
  char terminator;

  const char *marker = strstr(input, "uddg=");
  if (marker) {
    span = marker + 5;      /* strlen("uddg=") */
    terminator = '&';
  } else if ((marker = strstr(input, "RU=")) != NULL) {
    span = marker + 3;      /* strlen("RU=") */
    terminator = '/';
  } else {
    /* Not a recognized redirect: hand back a copy unchanged. */
    return strdup(input);
  }

  const char *stop = strchr(span, terminator);
  size_t span_len = stop ? (size_t)(stop - span) : strlen(span);

  char *decoded = (char *)malloc(span_len * 3 + 1);
  if (!decoded) return NULL;

  size_t out = 0;
  size_t pos = 0;
  while (pos < span_len) {
    if (span[pos] == '%' && pos + 2 < span_len) {
      int hi = hex_to_int(span[pos + 1]);
      int lo = hex_to_int(span[pos + 2]);
      if (hi != -1 && lo != -1) {
        decoded[out++] = (char)((hi << 4) | lo);
        pos += 3;
        continue;
      }
    }
    if (span[pos] == '+') {
      decoded[out++] = ' ';
      pos++;
    } else {
      decoded[out++] = span[pos++];
    }
  }
  decoded[out] = '\0';

  return decoded;
}
|
||||
|
||||
/*
 * Percent-decode a URL query string in the application/x-www-form-urlencoded
 * style: '+' becomes a space and "%XX" (two hex digits) becomes the byte it
 * encodes. Malformed escapes (e.g. "%zz" or a trailing '%') are passed
 * through unchanged. Returns a heap-allocated string the caller must free,
 * or NULL on NULL input / allocation failure.
 */
char *url_decode_query(const char *src) {
  if (!src) return NULL;

  /* Decoding never grows the string, so the copy is large enough to
   * decode in place. */
  char *res = strdup(src);
  if (!res) return NULL;

  char *p = res;
  while (*src) {
    if (*src == '+') {
      *p++ = ' ';
    } else if (*src == '%' && isxdigit((unsigned char)src[1]) &&
               isxdigit((unsigned char)src[2])) {
      /* BUG FIX: validate both hex digits before decoding. The old code
       * fed any two characters to strtol; invalid input ("%zz") produced
       * 0 and wrote a NUL byte, silently truncating the string. */
      char hex[3] = {src[1], src[2], '\0'};
      *p++ = (char)strtol(hex, NULL, 16);
      src += 2;
    } else {
      *p++ = *src;
    }
    src++;
  }
  *p = '\0';
  return res;
}
|
||||
10
src/Utility/Unescape.h
Normal file
10
src/Utility/Unescape.h
Normal file
@@ -0,0 +1,10 @@
|
||||
#ifndef UNESCAPE_H
#define UNESCAPE_H

#include <stddef.h>

/* Extract and percent-decode the real destination from a search-engine
 * redirect URL (DuckDuckGo "uddg=", Yahoo "RU="); other inputs are copied
 * unchanged. Caller frees the result. */
char *unescape_search_url(const char *input);

/* Percent-decode a query string ('+' -> space, "%XX" -> byte).
 * Caller frees the result. */
char *url_decode_query(const char *src);

#endif
|
||||
|
||||
8
src/Utility/Utility.c
Normal file
8
src/Utility/Utility.c
Normal file
@@ -0,0 +1,8 @@
|
||||
#include "Utility.h"
|
||||
|
||||
/*
 * Convert one hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
 * numeric value 0-15. Returns -1 for any non-hex character.
 */
int hex_to_int(char c) {
  if (c >= '0' && c <= '9') {
    return c - '0';
  }
  if (c >= 'a' && c <= 'f') {
    return 10 + (c - 'a');
  }
  if (c >= 'A' && c <= 'F') {
    return 10 + (c - 'A');
  }
  return -1;
}
|
||||
6
src/Utility/Utility.h
Normal file
6
src/Utility/Utility.h
Normal file
@@ -0,0 +1,6 @@
|
||||
#ifndef UTILITY_H
#define UTILITY_H

/* Convert one hex digit to its value 0-15, or -1 if not a hex digit. */
int hex_to_int(char c);

#endif
|
||||
Reference in New Issue
Block a user