summary refs log tree commit diff stats
path: root/src/api.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/api.c')
-rw-r--r-- src/api.c 180
1 files changed, 117 insertions, 63 deletions
diff --git a/src/api.c b/src/api.c
index 76431cc..9a9bbc8 100644
--- a/src/api.c
+++ b/src/api.c
@@ -48,9 +48,10 @@ char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char
buf_length += readstatus;
if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) {
buf_sizeof *= SC_REALLOC_K;
- buf = realloc(buf, sizeof(char)*buf_sizeof);
+ buf = realloc(buf, sizeof(char)*buf_sizeof); /* this IS safe, no matter how hard valgrind complains */
}
}
+ buf[buf_length++] = '\0';
if (readstatus == -1)
SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", endpoint_formatted ? endpoint_formatted : endpoint);
xmlNanoHTTPClose(r);
@@ -123,7 +124,7 @@ int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking
} /* TODO: be pedantic and remove utm_source and other tracking bullshit */
return 1;
}
-struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */
+struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q, SC_OPT_TYPE opt) { /* check4cachedB4 */
/* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */
/* if query is not NULL, it MUST be initialized */
/*
@@ -147,7 +148,13 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
implemented and will be hard work.
*/
- int rs;
+ int rs = 1;
+ char * xpath = NULL;
+ char * descclass = NULL;
+ char * titleclass = NULL;
+ char * imageclass = NULL;
+ htmlDocPtr xmldoc = NULL;
+ char * txtdoc = NULL;
if (!s || !c) {
rs = -1;
goto rc;
@@ -159,10 +166,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
qwasgiven++;
char * us = malloc(sizeof(char)*strlen(s)*3+1);
urlencode(us, s);
- char * xpath = NULL;
- char * descclass = NULL;
- char * titleclass = NULL;
- char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100", us);
+ txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100&ie=UTF-8%s", us, (opt&SC_OPT_IMAGE) ? "&tbm=isch" : "");
// fprintf(stdout, "%s\n", txtdoc);
free(us);
if (!txtdoc) {
@@ -170,91 +174,141 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
rs = -2;
goto rc;
}
- titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
- descclass = sc_find_class(txtdoc, "{word-break:break-word}");
- if (!titleclass || !descclass) {
- SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass");
- rs = -3;
- goto rc;
+ if (opt & SC_OPT_IMAGE) {
+ imageclass = sc_find_class(txtdoc, "{font-family:Roboto,Helvetica,Arial,sans-serif}");
+ if (!imageclass) {
+ SC_LOG(SC_LOG_ERROR, c, "!imageclass, txtdoc = %s", txtdoc);
+ rs = -3;
+ goto rc;
+ }
+ } else {
+ titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
+ descclass = sc_find_class(txtdoc, "{word-break:break-word}");
+ if (!titleclass || !descclass) {
+ SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass");
+ rs = -4;
+ goto rc;
+ }
}
#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */
#define SC_GTXD /* description */ "../..//table//span[@class='%s']"
#define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']"
+#define SC_GTXI "//div[@class='%s']//a"
#define SC_GTR q->results[q->results_length-1]
- xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
- sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */
- htmlDocPtr xmldoc = parseHtmlDocument(txtdoc, NULL);
+ xpath = malloc(strlen((opt & SC_OPT_IMAGE) ? imageclass : titleclass)+strlen((opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF));
+ sprintf(xpath, (opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF, (opt & SC_OPT_IMAGE) ? imageclass : titleclass);
+ xmldoc = parseHtmlDocument(txtdoc, NULL);
if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */
SC_CWLE(c, c->queries_lock);
q->results_length = 0;
gnu_code_start;
- eachNodeX(xmldoc, xpath,
- lambda(void, (xmlNodePtr node, void * data),
- {
- if (node->type == XML_ELEMENT_NODE) {
- xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
- if (href) {
- char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* fuck rules, I will rewrite it anyways <= hi future me */
- sc_fix_url(&hreflink);
- char * x = malloc(strlen(descclass)+strlen(SC_GTXD));
- char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB));
- sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */);
- sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */);
- xmlNodePtr descnode = nthNodeXN(node, x, 0);
- if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */
- descnode = nthNodeXN(node, "../../div/div", 0);
- xmlNodePtr breadnode = nthNodeXN(node, xbread, 0);
- free(x);
- free(xbread);
- if (q->results_sizeof <= q->results_length)
- SC_BIGGER_ARRAY(q->results, sc_result);
- q->results_length++;
- SC_GTR->query = q;
- SC_GTR->title = (char *) xmlNodeGetContent(node->children);
- if (!SC_GTR->title) {
- SC_GTR->title = malloc(strlen(SC_I18N_NO_TITLE)+1);
- strcpy(SC_GTR->title, SC_I18N_NO_TITLE);
- }
- SC_GTR->url = hreflink;
- if (!SC_GTR->url) {
- SC_GTR->url = malloc(strlen(SC_I18N_NO_HREFLINK)+1);
- strcpy(SC_GTR->url, SC_I18N_NO_HREFLINK);
- }
- SC_GTR->desc = (char *) xmlNodeGetContent(descnode);
- if (!SC_GTR->desc) {
- SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1);
- strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION);
- }
- SC_GTR->breadcrumbs = (char *) xmlNodeGetContent(breadnode);
- if (!SC_GTR->breadcrumbs) {
- SC_GTR->breadcrumbs = malloc(strlen(SC_GTR->url)+1);
- strcpy(SC_GTR->breadcrumbs, SC_GTR->url);
- }
- }
+ void sc_query_google_eachnode (xmlNodePtr node, void * data) {
+ if (node->type == XML_ELEMENT_NODE) {
+ xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
+ if (href) {
+ char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* xmlGetProp copies and allocates */
+ if (!hreflink) {
+ SC_LOG(SC_LOG_ERROR, c, "!hreflink");
+ rs = -5;
+ return;
+ }
+ if (opt & SC_OPT_IMAGE) {
+ char * imgurl = NULL; /* do not free those when allocated by sscanf, as they will directly go into the struct. */
+ char * imgrefurl = NULL; /* easy, huh? */
+ SC_LOG(SC_LOG_DEBUG, c, "hreflink = %s", hreflink);
+ sscanf(hreflink, "/imgres?imgurl=%m[^&]&imgrefurl=%m[^&]", &imgurl, &imgrefurl);
+ if (!imgurl && !imgrefurl) {
+ SC_LOG(SC_LOG_ERROR, c, "!imgurl && !imgrefurl");
+ /* rs = -6; */ /* we keep running instead of failing the whole query because of a single picture */
+ free(imgurl);
+ free(imgrefurl);
+ return; /* check! */
+ }
+ urldecode(imgurl, imgurl);
+ urldecode(imgrefurl, imgrefurl);
+ if (q->results_sizeof <= q->results_length)
+ SC_BIGGER_ARRAY(q->results, sc_result, 1);
+ q->results_length++;
+ SC_GTR->query = q;
+ SC_GTR->title = NULL; /* can't get title from here, would have to load /imgres, which is bloat */
+ SC_GTR->url = imgrefurl;
+ SC_GTR->desc = imgurl;
+ SC_GTR->breadcrumbs = NULL;
+ } else {
+ char * orig_hreflink_for_free = hreflink;
+ sc_fix_url(&hreflink);
+ char * x = malloc(strlen(descclass)+strlen(SC_GTXD));
+ char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB));
+ sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */);
+ sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */);
+ xmlNodePtr descnode = nthNodeXN(node, x, 0);
+ if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */
+ descnode = nthNodeXN(node, "../../div/div", 0);
+ xmlNodePtr breadnode = nthNodeXN(node, xbread, 0);
+ free(x);
+ free(xbread);
+ if (q->results_sizeof <= q->results_length)
+ SC_BIGGER_ARRAY(q->results, sc_result, 1);
+ q->results_length++;
+ SC_GTR->query = q;
+ char * cp = (char *) xmlNodeGetContent(node->children);
+ if (cp) {
+ SC_GTR->title = malloc(strlen(cp)+1);
+ strcpy(SC_GTR->title, cp);
+ xmlFree(cp);
+ } else SC_GTR->title = NULL;
+ if (hreflink) {
+ SC_GTR->url = malloc(strlen(hreflink)+1);
+ strcpy(SC_GTR->url, hreflink);
+ xmlFree(orig_hreflink_for_free);
+ } else SC_GTR->url = NULL;
+ cp = (char *) xmlNodeGetContent(descnode);
+ if (cp) {
+ SC_GTR->desc = malloc(strlen(cp)+1);
+ strcpy(SC_GTR->desc, cp);
+ xmlFree(cp);
+ } else SC_GTR->desc = NULL;
+ cp = (char *) xmlNodeGetContent(breadnode);
+ if (cp) {
+ SC_GTR->breadcrumbs = malloc(strlen(cp)+1);
+ strcpy(SC_GTR->breadcrumbs, cp);
+ xmlFree(cp);
}
}
- ),
- NULL);
+ }
+ }
+ }
+ eachNodeX(xmldoc, xpath, sc_query_google_eachnode, NULL);
gnu_code_end;
+ if (rs < 0) {
+ SC_LOG(SC_LOG_ERROR, c, "rs < 0 (rs == %d)", rs);
+ if (qwasgiven)
+ SC_CUE(c, c->queries_lock);
+ goto rc;
+ }
q->cache = c;
q->lookup_time = time(NULL);
q->engines = SC_ENGINE_GOOGLE;
q->string = realloc(q->string, strlen(s)+1);
+ q->opt = opt;
strcpy(q->string, s);
if (!qwasgiven) {
SC_CWLE(c, c->queries_lock);
if (c->queries_sizeof <= c->queries_length)
- SC_BIGGER_ARRAY(c->queries, sc_query);
+ SC_BIGGER_ARRAY(c->queries, sc_query, 0);
c->queries_length++;
#define SC_GTQ c->queries[c->queries_length-1]
SC_GTQ = q;
}
SC_CUE(c, c->queries_lock);
- xmlFreeDoc(xmldoc);
rc:
+ if (!qwasgiven && rs < 0)
+ sc_query_free(q);
+ xmlFreeDoc(xmldoc);
free(txtdoc);
free(titleclass);
free(descclass);
+ free(imageclass);
free(xpath);
return (rs < 0) ? NULL : q;
}