fix: some attempts to resolve some issues with images
This commit is contained in:
@@ -28,113 +28,82 @@ static char *build_proxy_url(const char *image_url) {
|
|||||||
return proxy_url;
|
return proxy_url;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int parse_image_node(xmlNodePtr node, ImageResult *result) {
|
static char *extract_json_string(const char *json, const char *key) {
|
||||||
xmlNodePtr img_node = NULL;
|
if (!json || !key)
|
||||||
xmlNodePtr tit_node = NULL;
|
return NULL;
|
||||||
xmlNodePtr des_node = NULL;
|
|
||||||
xmlNodePtr thumb_link = NULL;
|
|
||||||
|
|
||||||
for (xmlNodePtr child = node->children; child; child = child->next) {
|
char search_key[64];
|
||||||
if (child->type != XML_ELEMENT_NODE)
|
snprintf(search_key, sizeof(search_key), "\"%s\"", key);
|
||||||
continue;
|
|
||||||
|
|
||||||
if (xmlStrcmp(child->name, (const xmlChar *)"a") == 0) {
|
const char *key_pos = strstr(json, search_key);
|
||||||
xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
|
if (!key_pos)
|
||||||
if (class) {
|
return NULL;
|
||||||
if (xmlStrstr(class, (const xmlChar *)"thumb") != NULL) {
|
|
||||||
thumb_link = child;
|
const char *colon = strchr(key_pos + strlen(search_key), ':');
|
||||||
for (xmlNodePtr thumb_child = child->children; thumb_child;
|
if (!colon)
|
||||||
thumb_child = thumb_child->next) {
|
return NULL;
|
||||||
if (xmlStrcmp(thumb_child->name, (const xmlChar *)"div") == 0) {
|
|
||||||
xmlChar *div_class =
|
colon++;
|
||||||
xmlGetProp(thumb_child, (const xmlChar *)"class");
|
while (*colon == ' ' || *colon == '\t' || *colon == '\n' || *colon == '\r')
|
||||||
if (div_class &&
|
colon++;
|
||||||
xmlStrcmp(div_class, (const xmlChar *)"cico") == 0) {
|
|
||||||
for (xmlNodePtr cico_child = thumb_child->children; cico_child;
|
if (*colon != '"')
|
||||||
cico_child = cico_child->next) {
|
return NULL;
|
||||||
if (xmlStrcmp(cico_child->name, (const xmlChar *)"img") ==
|
colon++;
|
||||||
0) {
|
|
||||||
img_node = cico_child;
|
size_t len = 0;
|
||||||
break;
|
const char *start = colon;
|
||||||
}
|
while (*colon && *colon != '"') {
|
||||||
}
|
if (*colon == '\\' && *(colon + 1))
|
||||||
}
|
colon++;
|
||||||
if (div_class)
|
colon++;
|
||||||
xmlFree(div_class);
|
len++;
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (xmlStrstr(class, (const xmlChar *)"tit") != NULL) {
|
|
||||||
tit_node = child;
|
|
||||||
}
|
|
||||||
xmlFree(class);
|
|
||||||
}
|
|
||||||
} else if (xmlStrcmp(child->name, (const xmlChar *)"div") == 0) {
|
|
||||||
xmlChar *class = xmlGetProp(child, (const xmlChar *)"class");
|
|
||||||
if (class && xmlStrcmp(class, (const xmlChar *)"meta") == 0) {
|
|
||||||
for (xmlNodePtr meta_child = child->children; meta_child;
|
|
||||||
meta_child = meta_child->next) {
|
|
||||||
if (xmlStrcmp(meta_child->name, (const xmlChar *)"div") == 0) {
|
|
||||||
xmlChar *div_class =
|
|
||||||
xmlGetProp(meta_child, (const xmlChar *)"class");
|
|
||||||
if (div_class) {
|
|
||||||
if (xmlStrcmp(div_class, (const xmlChar *)"des") == 0) {
|
|
||||||
des_node = meta_child;
|
|
||||||
}
|
|
||||||
xmlFree(div_class);
|
|
||||||
}
|
|
||||||
} else if (xmlStrcmp(meta_child->name, (const xmlChar *)"a") == 0) {
|
|
||||||
xmlChar *a_class = xmlGetProp(meta_child, (const xmlChar *)"class");
|
|
||||||
if (a_class && xmlStrstr(a_class, (const xmlChar *)"tit") != NULL) {
|
|
||||||
tit_node = meta_child;
|
|
||||||
}
|
|
||||||
if (a_class)
|
|
||||||
xmlFree(a_class);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (class)
|
|
||||||
xmlFree(class);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
xmlChar *iurl =
|
char *result = malloc(len + 1);
|
||||||
img_node ? xmlGetProp(img_node, (const xmlChar *)"src") : NULL;
|
if (!result)
|
||||||
xmlChar *full_url =
|
return NULL;
|
||||||
thumb_link ? xmlGetProp(thumb_link, (const xmlChar *)"href") : NULL;
|
|
||||||
xmlChar *title = des_node ? xmlNodeGetContent(des_node)
|
|
||||||
: (tit_node ? xmlNodeGetContent(tit_node) : NULL);
|
|
||||||
xmlChar *rurl =
|
|
||||||
tit_node ? xmlGetProp(tit_node, (const xmlChar *)"href") : NULL;
|
|
||||||
|
|
||||||
if (!iurl || strlen((char *)iurl) == 0) {
|
colon = start;
|
||||||
if (iurl)
|
size_t i = 0;
|
||||||
xmlFree(iurl);
|
while (*colon && *colon != '"') {
|
||||||
if (title)
|
if (*colon == '\\' && *(colon + 1))
|
||||||
xmlFree(title);
|
colon++;
|
||||||
if (rurl)
|
result[i++] = *colon++;
|
||||||
xmlFree(rurl);
|
}
|
||||||
if (full_url)
|
result[i] = '\0';
|
||||||
xmlFree(full_url);
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int parse_iusc_node(xmlNodePtr node, ImageResult *result) {
|
||||||
|
xmlChar *m_attr = xmlGetProp(node, (const xmlChar *)"m");
|
||||||
|
if (!m_attr)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
char *turl = extract_json_string((const char *)m_attr, "turl");
|
||||||
|
char *murl = extract_json_string((const char *)m_attr, "murl");
|
||||||
|
char *purl = extract_json_string((const char *)m_attr, "purl");
|
||||||
|
char *title = extract_json_string((const char *)m_attr, "t");
|
||||||
|
|
||||||
|
int ok = (turl != NULL && strlen(turl) > 0);
|
||||||
|
if (ok) {
|
||||||
|
char *proxy_url = build_proxy_url(turl);
|
||||||
|
result->thumbnail_url = proxy_url ? strdup(proxy_url) : strdup(turl);
|
||||||
|
free(proxy_url);
|
||||||
|
result->title =
|
||||||
|
title && strlen(title) > 0 ? strdup(title) : strdup("Image");
|
||||||
|
result->page_url = purl && strlen(purl) > 0 ? strdup(purl) : strdup("#");
|
||||||
|
result->full_url = murl && strlen(murl) > 0 ? strdup(murl) : strdup("#");
|
||||||
}
|
}
|
||||||
|
|
||||||
char *proxy_url = build_proxy_url((char *)iurl);
|
free(turl);
|
||||||
result->thumbnail_url = proxy_url ? strdup(proxy_url) : strdup((char *)iurl);
|
free(murl);
|
||||||
free(proxy_url);
|
free(purl);
|
||||||
result->title = strdup(title ? (char *)title : "Image");
|
free(title);
|
||||||
result->page_url = strdup(rurl ? (char *)rurl : "#");
|
|
||||||
result->full_url = strdup(full_url ? (char *)full_url : "#");
|
|
||||||
|
|
||||||
if (iurl)
|
xmlFree(m_attr);
|
||||||
xmlFree(iurl);
|
return ok;
|
||||||
if (title)
|
|
||||||
xmlFree(title);
|
|
||||||
if (rurl)
|
|
||||||
xmlFree(rurl);
|
|
||||||
if (full_url)
|
|
||||||
xmlFree(full_url);
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int scrape_images(const char *query, int page, ImageResult **out_results,
|
int scrape_images(const char *query, int page, ImageResult **out_results,
|
||||||
@@ -157,13 +126,16 @@ int scrape_images(const char *query, int page, ImageResult **out_results,
|
|||||||
|
|
||||||
char url[BUFFER_SIZE_LARGE];
|
char url[BUFFER_SIZE_LARGE];
|
||||||
int first = (page - 1) * IMAGE_RESULTS_PER_PAGE + 1;
|
int first = (page - 1) * IMAGE_RESULTS_PER_PAGE + 1;
|
||||||
snprintf(url, sizeof(url), "%s?q=%s&first=%d", BING_IMAGE_URL, encoded_query,
|
snprintf(
|
||||||
first);
|
url, sizeof(url),
|
||||||
|
"https://www.bing.com/images/async?q=%s&async=content&first=%d&count=%d",
|
||||||
|
encoded_query, first, 35);
|
||||||
free(encoded_query);
|
free(encoded_query);
|
||||||
|
|
||||||
HttpResponse resp = http_get(
|
HttpResponse resp = http_get(
|
||||||
url,
|
url,
|
||||||
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko");
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, "
|
||||||
|
"like Gecko) Chrome/120.0.0.0 Safari/537.36");
|
||||||
if (!resp.memory) {
|
if (!resp.memory) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
@@ -183,7 +155,7 @@ int scrape_images(const char *query, int page, ImageResult **out_results,
|
|||||||
}
|
}
|
||||||
|
|
||||||
xmlXPathObjectPtr xpathObj =
|
xmlXPathObjectPtr xpathObj =
|
||||||
xmlXPathEvalExpression((const xmlChar *)"//div[@class='item']", xpathCtx);
|
xmlXPathEvalExpression((const xmlChar *)"//a[@class='iusc']", xpathCtx);
|
||||||
|
|
||||||
if (!xpathObj || !xpathObj->nodesetval) {
|
if (!xpathObj || !xpathObj->nodesetval) {
|
||||||
if (xpathObj)
|
if (xpathObj)
|
||||||
@@ -210,7 +182,7 @@ int scrape_images(const char *query, int page, ImageResult **out_results,
|
|||||||
int count = 0;
|
int count = 0;
|
||||||
for (int i = 0; i < nodes && count < IMAGE_RESULTS_PER_PAGE; i++) {
|
for (int i = 0; i < nodes && count < IMAGE_RESULTS_PER_PAGE; i++) {
|
||||||
xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
|
xmlNodePtr node = xpathObj->nodesetval->nodeTab[i];
|
||||||
if (parse_image_node(node, &results[count])) {
|
if (parse_iusc_node(node, &results[count])) {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -31,6 +31,17 @@ static size_t write_callback(void *contents, size_t size, size_t nmemb,
|
|||||||
return realsize;
|
return realsize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct curl_slist *build_http_headers(void) {
|
||||||
|
struct curl_slist *headers = NULL;
|
||||||
|
headers = curl_slist_append(
|
||||||
|
headers,
|
||||||
|
"Accept: "
|
||||||
|
"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
|
||||||
|
headers = curl_slist_append(headers, "Accept-Language: en-US,en;q=0.5");
|
||||||
|
headers = curl_slist_append(headers, "DNT: 1");
|
||||||
|
return headers;
|
||||||
|
}
|
||||||
|
|
||||||
HttpResponse http_get(const char *url, const char *user_agent) {
|
HttpResponse http_get(const char *url, const char *user_agent) {
|
||||||
HttpResponse resp = {.memory = NULL, .size = 0, .capacity = 0};
|
HttpResponse resp = {.memory = NULL, .size = 0, .capacity = 0};
|
||||||
|
|
||||||
@@ -51,16 +62,24 @@ HttpResponse http_get(const char *url, const char *user_agent) {
|
|||||||
return resp;
|
return resp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct curl_slist *headers = build_http_headers();
|
||||||
|
|
||||||
curl_easy_setopt(curl, CURLOPT_URL, url);
|
curl_easy_setopt(curl, CURLOPT_URL, url);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
|
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_callback);
|
||||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp);
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp);
|
||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT,
|
curl_easy_setopt(curl, CURLOPT_USERAGENT,
|
||||||
user_agent ? user_agent : "libcurl-agent/1.0");
|
user_agent ? user_agent : "libcurl-agent/1.0");
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 15L);
|
curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT_SECS);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2_0);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
|
||||||
|
curl_easy_setopt(curl, CURLOPT_DNS_CACHE_TIMEOUT, CURL_DNS_TIMEOUT_SECS);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
|
||||||
apply_proxy_settings(curl);
|
apply_proxy_settings(curl);
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl);
|
CURLcode res = curl_easy_perform(curl);
|
||||||
|
curl_slist_free_all(headers);
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
|
|
||||||
if (res != CURLE_OK) {
|
if (res != CURLE_OK) {
|
||||||
|
|||||||
Reference in New Issue
Block a user