added proxying
This commit is contained in:
@@ -1,3 +1,16 @@
|
|||||||
[server]
|
[server]
|
||||||
host = 0.0.0.0
|
host = 0.0.0.0
|
||||||
port = 8000
|
port = 8000
|
||||||
|
|
||||||
|
[proxy]
|
||||||
|
# Single proxy (comment out to use list_file instead)
|
||||||
|
#proxy = "socks5://127.0.0.1:9050"
|
||||||
|
|
||||||
|
# Or use a proxy list file (one proxy per line)
|
||||||
|
#list_file = proxies.txt
|
||||||
|
|
||||||
|
#max_retries = 3
|
||||||
|
|
||||||
|
# Randomize proxy credentials for each request
|
||||||
|
#randomize_username = 1
|
||||||
|
#randomize_password = 1
|
||||||
|
|||||||
18
src/Config.c
18
src/Config.c
@@ -46,11 +46,13 @@ int load_config(const char *filename, Config *config) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
char *value_end = value + strlen(value) - 1;
|
char *value_end = value + strlen(value) - 1;
|
||||||
while (value_end > value && (*value_end == ' ' || *value_end == '\t')) {
|
while (value_end > value && (*value_end == ' ' || *value_end == '\t' || *value_end == '"' || *value_end == '\'')) {
|
||||||
*value_end = '\0';
|
*value_end = '\0';
|
||||||
value_end--;
|
value_end--;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
while (*value == '"' || *value == '\'') value++;
|
||||||
|
|
||||||
if (strcmp(section, "server") == 0) {
|
if (strcmp(section, "server") == 0) {
|
||||||
if (strcmp(key, "host") == 0) {
|
if (strcmp(key, "host") == 0) {
|
||||||
strncpy(config->host, value, sizeof(config->host) - 1);
|
strncpy(config->host, value, sizeof(config->host) - 1);
|
||||||
@@ -58,6 +60,20 @@ int load_config(const char *filename, Config *config) {
|
|||||||
} else if (strcmp(key, "port") == 0) {
|
} else if (strcmp(key, "port") == 0) {
|
||||||
config->port = atoi(value);
|
config->port = atoi(value);
|
||||||
}
|
}
|
||||||
|
} else if (strcmp(section, "proxy") == 0) {
|
||||||
|
if (strcmp(key, "proxy") == 0) {
|
||||||
|
strncpy(config->proxy, value, sizeof(config->proxy) - 1);
|
||||||
|
config->proxy[sizeof(config->proxy) - 1] = '\0';
|
||||||
|
} else if (strcmp(key, "list_file") == 0) {
|
||||||
|
strncpy(config->proxy_list_file, value, sizeof(config->proxy_list_file) - 1);
|
||||||
|
config->proxy_list_file[sizeof(config->proxy_list_file) - 1] = '\0';
|
||||||
|
} else if (strcmp(key, "max_retries") == 0) {
|
||||||
|
config->max_proxy_retries = atoi(value);
|
||||||
|
} else if (strcmp(key, "randomize_username") == 0) {
|
||||||
|
config->randomize_username = atoi(value);
|
||||||
|
} else if (strcmp(key, "randomize_password") == 0) {
|
||||||
|
config->randomize_password = atoi(value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,6 +4,11 @@
|
|||||||
typedef struct {
|
typedef struct {
|
||||||
char host[256];
|
char host[256];
|
||||||
int port;
|
int port;
|
||||||
|
char proxy[256];
|
||||||
|
char proxy_list_file[256];
|
||||||
|
int max_proxy_retries;
|
||||||
|
int randomize_username;
|
||||||
|
int randomize_password;
|
||||||
} Config;
|
} Config;
|
||||||
|
|
||||||
int load_config(const char *filename, Config *config);
|
int load_config(const char *filename, Config *config);
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
#include "Dictionary.h"
|
#include "Dictionary.h"
|
||||||
|
#include "../Proxy/Proxy.h"
|
||||||
|
#include "../Scraping/Scraping.h"
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
#include <libxml/xpath.h>
|
#include <libxml/xpath.h>
|
||||||
@@ -216,6 +218,7 @@ InfoBox fetch_dictionary_data(const char *query) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk);
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &chunk);
|
||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0");
|
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0");
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
|
apply_proxy_settings(curl);
|
||||||
|
|
||||||
if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
|
if (curl_easy_perform(curl) == CURLE_OK && chunk.size > 0) {
|
||||||
htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
|
htmlDocPtr doc = htmlReadMemory(chunk.memory, chunk.size, url, NULL,
|
||||||
|
|||||||
@@ -1,4 +1,6 @@
|
|||||||
#include "Wikipedia.h"
|
#include "Wikipedia.h"
|
||||||
|
#include "../Proxy/Proxy.h"
|
||||||
|
#include "../Scraping/Scraping.h"
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/parser.h>
|
#include <libxml/parser.h>
|
||||||
#include <libxml/tree.h>
|
#include <libxml/tree.h>
|
||||||
@@ -123,6 +125,7 @@ InfoBox fetch_wiki_data(char *api_url) {
|
|||||||
WikiWriteMemoryCallback);
|
WikiWriteMemoryCallback);
|
||||||
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
|
curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, (void *)&chunk);
|
||||||
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
|
curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, "libcurl-agent/1.0");
|
||||||
|
apply_proxy_settings(curl_handle);
|
||||||
|
|
||||||
res = curl_easy_perform(curl_handle);
|
res = curl_easy_perform(curl_handle);
|
||||||
|
|
||||||
|
|||||||
34
src/Main.c
34
src/Main.c
@@ -1,10 +1,13 @@
|
|||||||
#include <beaker.h>
|
#include <beaker.h>
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/parser.h>
|
#include <libxml/parser.h>
|
||||||
|
#include <signal.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#include "Config.h"
|
#include "Config.h"
|
||||||
|
#include "Proxy/Proxy.h"
|
||||||
|
#include "Scraping/Scraping.h"
|
||||||
#include "Routes/Home.h"
|
#include "Routes/Home.h"
|
||||||
#include "Routes/Images.h"
|
#include "Routes/Images.h"
|
||||||
#include "Routes/ImageProxy.h"
|
#include "Routes/ImageProxy.h"
|
||||||
@@ -17,17 +20,45 @@ int handle_opensearch(UrlParams *params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
|
sigset_t mask;
|
||||||
|
sigemptyset(&mask);
|
||||||
|
sigaddset(&mask, SIGPIPE);
|
||||||
|
pthread_sigmask(SIG_BLOCK, &mask, NULL);
|
||||||
|
|
||||||
LIBXML_TEST_VERSION
|
LIBXML_TEST_VERSION
|
||||||
xmlInitParser();
|
xmlInitParser();
|
||||||
|
|
||||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||||
|
|
||||||
Config config = {.host = "0.0.0.0", .port = 5000};
|
Config config = {
|
||||||
|
.host = "0.0.0.0",
|
||||||
|
.port = 5000,
|
||||||
|
.proxy = "",
|
||||||
|
.proxy_list_file = "",
|
||||||
|
.max_proxy_retries = 3,
|
||||||
|
.randomize_username = 0,
|
||||||
|
.randomize_password = 0
|
||||||
|
};
|
||||||
|
|
||||||
if (load_config("config.ini", &config) != 0) {
|
if (load_config("config.ini", &config) != 0) {
|
||||||
fprintf(stderr, "Warning: Could not load config file, using defaults\n");
|
fprintf(stderr, "Warning: Could not load config file, using defaults\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (config.proxy_list_file[0] != '\0') {
|
||||||
|
if (load_proxy_list(config.proxy_list_file) < 0) {
|
||||||
|
fprintf(stderr, "Warning: Failed to load proxy list, continuing without proxies\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
max_proxy_retries = config.max_proxy_retries;
|
||||||
|
set_proxy_config(config.proxy, config.randomize_username, config.randomize_password);
|
||||||
|
|
||||||
|
if (proxy_url[0] != '\0') {
|
||||||
|
fprintf(stderr, "Using proxy: %s\n", proxy_url);
|
||||||
|
} else if (proxy_count > 0) {
|
||||||
|
fprintf(stderr, "Using %d proxies from %s\n", proxy_count, config.proxy_list_file);
|
||||||
|
}
|
||||||
|
|
||||||
set_handler("/", home_handler);
|
set_handler("/", home_handler);
|
||||||
set_handler("/opensearch.xml", handle_opensearch);
|
set_handler("/opensearch.xml", handle_opensearch);
|
||||||
set_handler("/search", results_handler);
|
set_handler("/search", results_handler);
|
||||||
@@ -47,5 +78,6 @@ int main() {
|
|||||||
|
|
||||||
curl_global_cleanup();
|
curl_global_cleanup();
|
||||||
xmlCleanupParser();
|
xmlCleanupParser();
|
||||||
|
free_proxy_list();
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
257
src/Proxy/Proxy.c
Normal file
257
src/Proxy/Proxy.c
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
#include "Proxy.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <time.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
|
||||||
|
/* Heap array of proxies filled by load_proxy_list(); NULL when no list is loaded. */
Proxy *proxy_list = NULL;
|
||||||
|
/* Number of valid entries in proxy_list. */
int proxy_count = 0;
|
||||||
|
/* Failure threshold: get_random_proxy() skips entries whose failures reach this value. */
int max_proxy_retries = 3;
|
||||||
|
/* Non-zero: apply_proxy_settings() generates a random proxy username (single-proxy mode). */
int randomize_username = 0;
|
||||||
|
/* Non-zero: apply_proxy_settings() generates a random proxy password (single-proxy mode). */
int randomize_password = 0;
|
||||||
|
/* Single proxy URL set by set_proxy_config(); when non-empty it takes priority over proxy_list. */
char proxy_url[512] = {0};
|
||||||
|
/* Serializes access to proxy_list, proxy_count and the per-proxy failure counters. */
static pthread_mutex_t proxy_mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
/* Alphabet used when generating random proxy credentials. */
static const char RAND_CHARS[] = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

/*
 * Fill buf with (len - 1) characters drawn from RAND_CHARS using rand(),
 * then NUL-terminate it.
 *
 * buf: destination buffer of at least len bytes.
 * len: total buffer size including the terminator.
 *
 * A NULL buffer or a zero length is a no-op: with len == 0 the expression
 * `len - 1` would wrap around to SIZE_MAX and overrun the buffer.
 */
static void generate_random_string(char *buf, size_t len) {
    if (buf == NULL || len == 0) {
        return;
    }

    for (size_t i = 0; i < len - 1; i++) {
        buf[i] = RAND_CHARS[rand() % (sizeof(RAND_CHARS) - 1)];
    }
    buf[len - 1] = '\0';
}
|
||||||
|
|
||||||
|
void set_proxy_config(const char *proxy_str, int rand_user, int rand_pass) {
|
||||||
|
if (proxy_str && proxy_str[0]) {
|
||||||
|
strncpy(proxy_url, proxy_str, sizeof(proxy_url) - 1);
|
||||||
|
proxy_url[sizeof(proxy_url) - 1] = '\0';
|
||||||
|
}
|
||||||
|
randomize_username = rand_user;
|
||||||
|
randomize_password = rand_pass;
|
||||||
|
}
|
||||||
|
|
||||||
|
static Proxy parse_proxy_line(const char *line) {
|
||||||
|
Proxy proxy = {.type = PROXY_SOCKS5, .port = 0, .username[0] = '\0', .password[0] = '\0', .failures = 0};
|
||||||
|
const char *host_start = NULL;
|
||||||
|
const char *port_start = NULL;
|
||||||
|
|
||||||
|
size_t len = strlen(line);
|
||||||
|
if (len == 0) return proxy;
|
||||||
|
|
||||||
|
if (strncmp(line, "http://", 7) == 0) {
|
||||||
|
proxy.type = PROXY_HTTP;
|
||||||
|
host_start = line + 7;
|
||||||
|
} else if (strncmp(line, "socks5://", 9) == 0) {
|
||||||
|
proxy.type = PROXY_SOCKS5;
|
||||||
|
host_start = line + 9;
|
||||||
|
} else if (strncmp(line, "socks4://", 9) == 0) {
|
||||||
|
proxy.type = PROXY_SOCKS4;
|
||||||
|
host_start = line + 9;
|
||||||
|
} else {
|
||||||
|
host_start = line;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *at = strchr(host_start, '@');
|
||||||
|
if (at) {
|
||||||
|
char cred_buf[128];
|
||||||
|
size_t cred_len = at - host_start;
|
||||||
|
if (cred_len >= sizeof(cred_buf)) cred_len = sizeof(cred_buf) - 1;
|
||||||
|
strncpy(cred_buf, host_start, cred_len);
|
||||||
|
cred_buf[cred_len] = '\0';
|
||||||
|
|
||||||
|
char *colon = strchr(cred_buf, ':');
|
||||||
|
if (colon) {
|
||||||
|
size_t user_len = colon - cred_buf;
|
||||||
|
if (user_len >= sizeof(proxy.username)) user_len = sizeof(proxy.username) - 1;
|
||||||
|
strncpy(proxy.username, cred_buf, user_len);
|
||||||
|
proxy.username[user_len] = '\0';
|
||||||
|
strncpy(proxy.password, colon + 1, sizeof(proxy.password) - 1);
|
||||||
|
proxy.password[sizeof(proxy.password) - 1] = '\0';
|
||||||
|
}
|
||||||
|
host_start = at + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
port_start = strchr(host_start, ':');
|
||||||
|
if (port_start) {
|
||||||
|
char host_buf[256];
|
||||||
|
size_t host_len = port_start - host_start;
|
||||||
|
if (host_len >= sizeof(host_buf)) host_len = sizeof(host_buf) - 1;
|
||||||
|
strncpy(host_buf, host_start, host_len);
|
||||||
|
host_buf[host_len] = '\0';
|
||||||
|
snprintf(proxy.host, sizeof(proxy.host), "%.*s", (int)host_len, host_buf);
|
||||||
|
proxy.port = atoi(port_start + 1);
|
||||||
|
} else {
|
||||||
|
snprintf(proxy.host, sizeof(proxy.host), "%s", host_start);
|
||||||
|
}
|
||||||
|
|
||||||
|
return proxy;
|
||||||
|
}
|
||||||
|
|
||||||
|
int load_proxy_list(const char *filename) {
|
||||||
|
if (!filename || filename[0] == '\0') {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
pthread_mutex_lock(&proxy_mutex);
|
||||||
|
|
||||||
|
if (proxy_list) {
|
||||||
|
free(proxy_list);
|
||||||
|
proxy_list = NULL;
|
||||||
|
}
|
||||||
|
proxy_count = 0;
|
||||||
|
|
||||||
|
FILE *file = fopen(filename, "r");
|
||||||
|
if (!file) {
|
||||||
|
pthread_mutex_unlock(&proxy_mutex);
|
||||||
|
fprintf(stderr, "[WARN] Could not open proxy list file: %s\n", filename);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int capacity = 16;
|
||||||
|
proxy_list = (Proxy *)malloc(capacity * sizeof(Proxy));
|
||||||
|
if (!proxy_list) {
|
||||||
|
fclose(file);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
proxy_count = 0;
|
||||||
|
|
||||||
|
char line[512];
|
||||||
|
while (fgets(line, sizeof(line), file)) {
|
||||||
|
line[strcspn(line, "\r\n")] = 0;
|
||||||
|
|
||||||
|
if (line[0] == '\0' || line[0] == '#') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *p = line;
|
||||||
|
while (*p == ' ' || *p == '\t') p++;
|
||||||
|
|
||||||
|
char *end = p + strlen(p) - 1;
|
||||||
|
while (end > p && (*end == ' ' || *end == '\t')) {
|
||||||
|
*end = '\0';
|
||||||
|
end--;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (p[0] == '\0') continue;
|
||||||
|
|
||||||
|
Proxy proxy = parse_proxy_line(p);
|
||||||
|
if (proxy.port == 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (proxy_count >= capacity) {
|
||||||
|
capacity *= 2;
|
||||||
|
Proxy *new_list = (Proxy *)realloc(proxy_list, capacity * sizeof(Proxy));
|
||||||
|
if (!new_list) {
|
||||||
|
free(proxy_list);
|
||||||
|
proxy_list = NULL;
|
||||||
|
proxy_count = 0;
|
||||||
|
fclose(file);
|
||||||
|
pthread_mutex_unlock(&proxy_mutex);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
proxy_list = new_list;
|
||||||
|
}
|
||||||
|
|
||||||
|
proxy_list[proxy_count++] = proxy;
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(file);
|
||||||
|
fprintf(stderr, "[INFO] Loaded %d proxies from %s\n", proxy_count, filename);
|
||||||
|
pthread_mutex_unlock(&proxy_mutex);
|
||||||
|
return proxy_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free_proxy_list(void) {
|
||||||
|
pthread_mutex_lock(&proxy_mutex);
|
||||||
|
if (proxy_list) {
|
||||||
|
free(proxy_list);
|
||||||
|
proxy_list = NULL;
|
||||||
|
}
|
||||||
|
proxy_count = 0;
|
||||||
|
pthread_mutex_unlock(&proxy_mutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
Proxy *get_random_proxy(void) {
|
||||||
|
pthread_mutex_lock(&proxy_mutex);
|
||||||
|
if (proxy_count == 0) {
|
||||||
|
pthread_mutex_unlock(&proxy_mutex);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int start = rand() % proxy_count;
|
||||||
|
int checked = 0;
|
||||||
|
Proxy *selected = NULL;
|
||||||
|
|
||||||
|
while (checked < proxy_count) {
|
||||||
|
int idx = (start + checked) % proxy_count;
|
||||||
|
if (proxy_list[idx].failures < max_proxy_retries) {
|
||||||
|
selected = &proxy_list[idx];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
checked++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!selected) {
|
||||||
|
for (int i = 0; i < proxy_count; i++) {
|
||||||
|
proxy_list[i].failures = 0;
|
||||||
|
}
|
||||||
|
selected = &proxy_list[rand() % proxy_count];
|
||||||
|
}
|
||||||
|
|
||||||
|
pthread_mutex_unlock(&proxy_mutex);
|
||||||
|
return selected;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Increment the failure counter of `proxy` under proxy_mutex.
 * A NULL pointer is ignored.
 */
void record_proxy_failure(Proxy *proxy) {
    if (proxy != NULL) {
        pthread_mutex_lock(&proxy_mutex);
        proxy->failures += 1;
        pthread_mutex_unlock(&proxy_mutex);
    }
}
|
||||||
|
|
||||||
|
/*
 * Configure proxying on a libcurl easy handle from the module settings.
 *
 * - If proxy_url is set, it is used as the proxy. The proxy type follows the
 *   "socks5://" / "socks4://" prefix (anything else is treated as HTTP), and
 *   random credentials are generated when randomize_username and/or
 *   randomize_password are enabled (a disabled part is sent empty).
 * - Otherwise, if a proxy list is loaded, one entry is taken from
 *   get_random_proxy() and applied together with its stored credentials.
 * - With neither configured, or with a NULL handle, nothing is changed.
 *
 * CURLOPT_PROXYTYPE is a long-valued option, so the CURLPROXY_* constants are
 * converted to long before being passed through curl_easy_setopt's varargs.
 * NOTE(review): the string options receive stack buffers; this relies on
 * libcurl copying string arguments - confirm for the minimum supported
 * libcurl version.
 */
void apply_proxy_settings(CURL *curl) {
    if (curl == NULL) {
        return;
    }

    if (proxy_url[0] != '\0') {
        curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url);

        long proxy_type = (long)CURLPROXY_HTTP;
        if (strncmp(proxy_url, "socks5://", 9) == 0) {
            proxy_type = (long)CURLPROXY_SOCKS5;
        } else if (strncmp(proxy_url, "socks4://", 9) == 0) {
            proxy_type = (long)CURLPROXY_SOCKS4A;
        }
        curl_easy_setopt(curl, CURLOPT_PROXYTYPE, proxy_type);

        if (randomize_username || randomize_password) {
            char userpwd[256];
            char username[32] = {0};
            char password[32] = {0};

            if (randomize_username) generate_random_string(username, sizeof(username));
            if (randomize_password) generate_random_string(password, sizeof(password));

            snprintf(userpwd, sizeof(userpwd), "%s:%s", username, password);
            curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd);
        }
        return;
    }

    if (proxy_count <= 0) {
        return;
    }

    Proxy *proxy = get_random_proxy();
    if (proxy == NULL) {
        return;
    }

    char proxy_url_buf[512];
    snprintf(proxy_url_buf, sizeof(proxy_url_buf), "%s:%d", proxy->host, proxy->port);
    curl_easy_setopt(curl, CURLOPT_PROXY, proxy_url_buf);

    long proxy_type;
    switch (proxy->type) {
    case PROXY_HTTP:
        proxy_type = (long)CURLPROXY_HTTP;
        break;
    case PROXY_SOCKS4:
        proxy_type = (long)CURLPROXY_SOCKS4A;
        break;
    default:
        proxy_type = (long)CURLPROXY_SOCKS5;
        break;
    }
    curl_easy_setopt(curl, CURLOPT_PROXYTYPE, proxy_type);

    if (proxy->username[0] != '\0' || proxy->password[0] != '\0') {
        char userpwd[128];
        snprintf(userpwd, sizeof(userpwd), "%s:%s", proxy->username, proxy->password);
        curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, userpwd);
    }
}
|
||||||
31
src/Proxy/Proxy.h
Normal file
31
src/Proxy/Proxy.h
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
#ifndef PROXY_H
#define PROXY_H

#include <curl/curl.h>

/* Proxy protocols a list entry can use. */
typedef enum {
    PROXY_HTTP,
    PROXY_SOCKS4,
    PROXY_SOCKS5
} ProxyType;

/* One proxy endpoint with optional credentials and a failure counter. */
typedef struct {
    ProxyType type;
    char host[256];
    int port;
    char username[64];
    char password[64];
    int failures;
} Proxy;

/* Module state, defined in Proxy.c. */
extern Proxy *proxy_list;
extern int proxy_count;
extern int max_proxy_retries;
extern int randomize_username;
extern int randomize_password;
extern char proxy_url[512];

/* Load the proxy list from a file; returns the number loaded, 0 for no filename, -1 on error. */
int load_proxy_list(const char *filename);

/* Release the loaded proxy list. */
void free_proxy_list(void);

/* Pick a proxy from the loaded list, or NULL when the list is empty. */
Proxy *get_random_proxy(void);

/* Increment the failure counter of a proxy (NULL is ignored). */
void record_proxy_failure(Proxy *proxy);

/* Apply the configured single proxy or a proxy from the list to a curl handle. */
void apply_proxy_settings(CURL *curl);

/* Set the single proxy URL and the credential-randomization flags. */
void set_proxy_config(const char *proxy_str, int rand_user, int rand_pass);

#endif /* PROXY_H */
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
#include "ImageProxy.h"
|
#include "ImageProxy.h"
|
||||||
|
#include "../Proxy/Proxy.h"
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
@@ -118,6 +118,7 @@ int image_proxy_handler(UrlParams *params) {
|
|||||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
|
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
|
||||||
|
apply_proxy_settings(curl);
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl);
|
CURLcode res = curl_easy_perform(curl);
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,7 @@
|
|||||||
#include "Images.h"
|
#include "Images.h"
|
||||||
#include "../Utility/Unescape.h"
|
#include "../Utility/Unescape.h"
|
||||||
|
#include "../Proxy/Proxy.h"
|
||||||
|
#include "../Scraping/Scraping.h"
|
||||||
|
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
@@ -50,6 +52,7 @@ static char *fetch_images_html(const char *url) {
|
|||||||
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
|
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
|
||||||
curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
|
curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L);
|
curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, 10L);
|
||||||
|
apply_proxy_settings(curl_handle);
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl_handle);
|
CURLcode res = curl_easy_perform(curl_handle);
|
||||||
if (res != CURLE_OK) {
|
if (res != CURLE_OK) {
|
||||||
@@ -247,7 +250,7 @@ int images_handler(UrlParams *params) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
image_matrix[image_count] = malloc(sizeof(char *) * 4);
|
image_matrix[image_count] = malloc(sizeof(char *) * 4);
|
||||||
image_matrix[image_count][0] = proxy_url ? proxy_url : strdup((char *)iurl);
|
image_matrix[image_count][0] = proxy_url ? strdup(proxy_url) : strdup((char *)iurl);
|
||||||
image_matrix[image_count][1] = strdup(title ? (char *)title : "Image");
|
image_matrix[image_count][1] = strdup(title ? (char *)title : "Image");
|
||||||
image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#");
|
image_matrix[image_count][2] = strdup(rurl ? (char *)rurl : "#");
|
||||||
image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#");
|
image_matrix[image_count][3] = strdup(full_url ? (char *)full_url : "#");
|
||||||
|
|||||||
@@ -88,10 +88,10 @@ static int add_infobox_to_collection(InfoBox *infobox, char ****collection,
|
|||||||
(int *)realloc(*inner_counts, sizeof(int) * (current_count + 1));
|
(int *)realloc(*inner_counts, sizeof(int) * (current_count + 1));
|
||||||
|
|
||||||
(*collection)[current_count] = (char **)malloc(sizeof(char *) * 4);
|
(*collection)[current_count] = (char **)malloc(sizeof(char *) * 4);
|
||||||
(*collection)[current_count][0] = infobox->title;
|
(*collection)[current_count][0] = infobox->title ? strdup(infobox->title) : NULL;
|
||||||
(*collection)[current_count][1] = infobox->thumbnail_url;
|
(*collection)[current_count][1] = infobox->thumbnail_url ? strdup(infobox->thumbnail_url) : NULL;
|
||||||
(*collection)[current_count][2] = infobox->extract;
|
(*collection)[current_count][2] = infobox->extract ? strdup(infobox->extract) : NULL;
|
||||||
(*collection)[current_count][3] = infobox->url;
|
(*collection)[current_count][3] = infobox->url ? strdup(infobox->url) : NULL;
|
||||||
(*inner_counts)[current_count] = 4;
|
(*inner_counts)[current_count] = 4;
|
||||||
|
|
||||||
return current_count + 1;
|
return current_count + 1;
|
||||||
@@ -151,6 +151,10 @@ int results_handler(UrlParams *params) {
|
|||||||
jobs[i].max_results = 10;
|
jobs[i].max_results = 10;
|
||||||
jobs[i].results_count = 0;
|
jobs[i].results_count = 0;
|
||||||
jobs[i].page = page;
|
jobs[i].page = page;
|
||||||
|
jobs[i].handle = NULL;
|
||||||
|
jobs[i].response.memory = NULL;
|
||||||
|
jobs[i].response.size = 0;
|
||||||
|
jobs[i].response.capacity = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
scrape_engines_parallel(jobs, ENGINE_COUNT);
|
scrape_engines_parallel(jobs, ENGINE_COUNT);
|
||||||
@@ -185,6 +189,10 @@ int results_handler(UrlParams *params) {
|
|||||||
if (infobox_count > 0) {
|
if (infobox_count > 0) {
|
||||||
context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix,
|
context_set_array_of_arrays(&ctx, "infoboxes", infobox_matrix,
|
||||||
infobox_count, infobox_inner_counts);
|
infobox_count, infobox_inner_counts);
|
||||||
|
for (int i = 0; i < infobox_count; i++) {
|
||||||
|
for (int j = 0; j < 4; j++) free(infobox_matrix[i][j]);
|
||||||
|
free(infobox_matrix[i]);
|
||||||
|
}
|
||||||
free(infobox_matrix);
|
free(infobox_matrix);
|
||||||
free(infobox_inner_counts);
|
free(infobox_inner_counts);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#include "Scraping.h"
|
#include "Scraping.h"
|
||||||
|
#include "../Proxy/Proxy.h"
|
||||||
#include "../Utility/Unescape.h"
|
#include "../Utility/Unescape.h"
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
@@ -329,9 +330,14 @@ static void configure_curl_handle(CURL *curl, const char *full_url,
|
|||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
|
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
|
||||||
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
|
||||||
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
|
curl_easy_setopt(curl, CURLOPT_COOKIEFILE, "");
|
||||||
|
|
||||||
|
apply_proxy_settings(curl);
|
||||||
}
|
}
|
||||||
|
|
||||||
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
||||||
|
int retries = 0;
|
||||||
|
|
||||||
|
retry:
|
||||||
CURLM *multi_handle = curl_multi_init();
|
CURLM *multi_handle = curl_multi_init();
|
||||||
if (!multi_handle) {
|
if (!multi_handle) {
|
||||||
return -1;
|
return -1;
|
||||||
@@ -339,6 +345,15 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
|||||||
|
|
||||||
for (int i = 0; i < num_jobs; i++) {
|
for (int i = 0; i < num_jobs; i++) {
|
||||||
ScrapeJob *job = &jobs[i];
|
ScrapeJob *job = &jobs[i];
|
||||||
|
|
||||||
|
if (job->handle) {
|
||||||
|
curl_easy_cleanup(job->handle);
|
||||||
|
job->handle = NULL;
|
||||||
|
}
|
||||||
|
if (job->response.memory) {
|
||||||
|
free(job->response.memory);
|
||||||
|
}
|
||||||
|
|
||||||
job->handle = curl_easy_init();
|
job->handle = curl_easy_init();
|
||||||
if (!job->handle) {
|
if (!job->handle) {
|
||||||
continue;
|
continue;
|
||||||
@@ -406,7 +421,7 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
|||||||
CURL *handle = msg->easy_handle;
|
CURL *handle = msg->easy_handle;
|
||||||
|
|
||||||
for (int i = 0; i < num_jobs; i++) {
|
for (int i = 0; i < num_jobs; i++) {
|
||||||
if (jobs[i].handle == handle) {
|
if (jobs[i].handle && jobs[i].handle == handle) {
|
||||||
ScrapeJob *job = &jobs[i];
|
ScrapeJob *job = &jobs[i];
|
||||||
|
|
||||||
long response_code;
|
long response_code;
|
||||||
@@ -431,8 +446,10 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
|||||||
if (headers) curl_slist_free_all(headers);
|
if (headers) curl_slist_free_all(headers);
|
||||||
|
|
||||||
free(job->response.memory);
|
free(job->response.memory);
|
||||||
|
job->response.memory = NULL;
|
||||||
curl_multi_remove_handle(multi_handle, handle);
|
curl_multi_remove_handle(multi_handle, handle);
|
||||||
curl_easy_cleanup(handle);
|
if (handle) curl_easy_cleanup(handle);
|
||||||
|
job->handle = NULL;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -440,6 +457,21 @@ int scrape_engines_parallel(ScrapeJob *jobs, int num_jobs) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
curl_multi_cleanup(multi_handle);
|
curl_multi_cleanup(multi_handle);
|
||||||
|
|
||||||
|
if (retries < max_proxy_retries && proxy_count > 0) {
|
||||||
|
int any_failed = 0;
|
||||||
|
for (int i = 0; i < num_jobs; i++) {
|
||||||
|
if (jobs[i].results_count == 0 && jobs[i].response.size == 0) {
|
||||||
|
any_failed = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (any_failed) {
|
||||||
|
retries++;
|
||||||
|
goto retry;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,11 +4,6 @@
|
|||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
|
|
||||||
#define LOG_INFO(msg, ...) fprintf(stderr, "[INFO] " msg "\n", ##__VA_ARGS__)
|
|
||||||
#define LOG_WARN(msg, ...) fprintf(stderr, "[WARN] " msg "\n", ##__VA_ARGS__)
|
|
||||||
#define LOG_DEBUG(msg, ...) fprintf(stderr, "[DEBUG] " msg "\n", ##__VA_ARGS__)
|
|
||||||
#define LOG_ERROR(msg, ...) fprintf(stderr, "[ERROR] " msg "\n", ##__VA_ARGS__)
|
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
char *url;
|
char *url;
|
||||||
char *title;
|
char *title;
|
||||||
|
|||||||
Reference in New Issue
Block a user