#define SC_CAPI(c, b, h, e, ...) sc_api(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__) #define SC_CAPIX(c, b, h, e, ...) sc_capix(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__) char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) { if (!c || !endpoint) return NULL; size_t va_count = parse_printf_format(endpoint, 0, NULL); char * endpoint_formatted = NULL; long response_code = 0; if (isfmt && va_count > 0 && endpoint_formatted == NULL) { va_list ap, ap2; va_start(ap, endpoint); va_copy(ap2, ap); size_t strlenm = vsnprintf(NULL, 0, endpoint, ap); endpoint_formatted = malloc(sizeof(char)*strlenm+1); vsnprintf(endpoint_formatted, strlenm+1, endpoint, ap2); va_end(ap); va_end(ap2); } if (!headers) headers = ""; char * hedrs = malloc(sizeof(char)*strlen(headers)+strlen(SC_HTTP_HEADERS)+1); strcpy(hedrs, SC_HTTP_HEADERS); strcat(hedrs, headers); char * contentType = NULL; char * redir = NULL; char * buf = malloc(sizeof(char)*SC_HTTP_RBUFSIZE); size_t buf_sizeof = SC_HTTP_RBUFSIZE; size_t buf_length = 0; int readstatus = 0; void * r = xmlNanoHTTPMethodRedir( endpoint_formatted ? endpoint_formatted : endpoint, body ? "POST" : "GET", body, &contentType, &redir, hedrs, body ? strlen(body) : 0 ); if (!r) { SC_LOG(SC_LOG_ERROR, c, "!r, endpoint: %s", endpoint_formatted ? endpoint_formatted : endpoint); goto rc; } response_code = xmlNanoHTTPReturnCode(r); if (!(response_code - 200 >= 0 && response_code - 200 < 100)) { SC_LOG(SC_LOG_ERROR, c, "response_code == %ld, endpoint: %s", response_code, endpoint_formatted ? endpoint_formatted:endpoint); } while ((readstatus = xmlNanoHTTPRead(r, buf+buf_length, buf_sizeof-buf_length)) > 0) { buf_length += readstatus; if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) { buf_sizeof *= SC_REALLOC_K; buf = realloc(buf, sizeof(char)*buf_sizeof); /* this IS safe, no matter how hard valgrind complains */ } } buf[buf_length++] = '\0'; if (readstatus == -1) SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", endpoint_formatted ? endpoint_formatted : endpoint); xmlNanoHTTPClose(r); SC_LOG(SC_LOG_DEBUG, c, "contentType = %s, redir = %s", contentType ? contentType : "NULL", redir ? redir : "NULL"); rc: free(endpoint_formatted); free(contentType); free(redir); free(hedrs); return buf; } htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) { if (!c || !endpoint) return NULL; size_t va_count = parse_printf_format(endpoint, 0, NULL); char * endpoint_formatted = NULL; if (isfmt && va_count > 0 && endpoint_formatted == NULL) { va_list ap, ap2; va_start(ap, endpoint); va_copy(ap2, ap); size_t strlenm = vsnprintf(NULL, 0, endpoint, ap); endpoint_formatted = malloc(sizeof(char)*strlenm+1); vsnprintf(endpoint_formatted, strlenm+1, endpoint, ap2); va_end(ap); va_end(ap2); } char * buf = sc_api(c, body, headers, 0, endpoint_formatted ? endpoint_formatted : endpoint); htmlDocPtr htmldoc = parseHtmlDocument(buf, endpoint_formatted ? endpoint_formatted : endpoint); free(buf); free(endpoint_formatted); return htmldoc; } char * sc_find_class (char * haystack, const char * definition) { /* you must free class after calling */ if (!haystack || !definition) return NULL; char * class = strstr(haystack, definition); if (!class) return NULL; int found = 0; for (; class > haystack; class--) if (class[-1] == '.' && (found = 1)) break; if (!found) return NULL; char * endofclass = class; found = 0; for (; *endofclass; endofclass++) /* google only has alphanumeric class names. TODO: be pedantic and conformic to w3 stds */ if (!isalnum(endofclass[0]) && (found = 1)) break; if (!found) return NULL; char * toreturn = malloc(endofclass-class+1); strncpy(toreturn, class, endofclass-class); toreturn[endofclass-class] = '\0'; return toreturn; } int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking nonsense, so resulting URL is shorter or equl) */ if (!h || !*h) /* stage 0: prevent accidental death */ return -1; if (!strncmp(*h, "/url?q=", strlen("/url?q="))) { /* stage 1: url may be tracking url by google results */ *h = *h+strlen("/url?q="); *strchrnul(*h, '&') = '\0'; urldecode(*h, *h); } char * c = NULL; if ((c = strstr(*h, "googleweblight.com/fp?u="))) { /* stage 2: url may be "light web" tracking url by google results */ *h = c+strlen("googleweblight.com/fp?u="); /* we could disable this with a cookie but meh, this is easier and _stateless_ */ *strchrnul(*h, '&') = '\0'; urldecode(*h, *h); } /* TODO: be pedantic and remove utm_source and other tracking bullshit */ return 1; } struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q, SC_OPT_TYPE opt) { /* check4cachedB4 */ /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */ /* if query is not NULL, it MUST be initialized */ /* remarks: * we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website * we determine which class holds a specific value by looking at the css definitions - result title: the only class that has definition {color:#1967D2;font-size:14px;line-height:16px} + A links have this class set, but they have a child SPAN element that then holds the text of the title + A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link. - result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link. + extract those two classes and find the one that is only present on SPAN text elements. - result description: once we have the result div, the description is the //table//span with the appropriate class + the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements. - result div: to get the result div, we need the parent of the parent of the A link of the title. * result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP. - we won't parse those yet * I couldn't find anything with ratings, so we won't parse thouse either yet * captcha: google knows that this nokia phone we're pretending to be doesn't support javascript - the request limiting captcha must work on a phone without javascript. it is probably loaded inside an iframe, but has origin protection, so we can't just solve it client-side. we would have to proxy images and create some sort of a session based http request-response based user interface so we can ask the user to complete the captcha. this is not yet implemeted and will be hard work. */ int rs = 1; char * xpath = NULL; char * descclass = NULL; char * titleclass = NULL; char * imageclass = NULL; htmlDocPtr xmldoc = NULL; char * txtdoc = NULL; if (!s || !c) { rs = -1; goto rc; } int qwasgiven = 0; if (!q) q = sc_query_init(); else qwasgiven++; char * us = malloc(sizeof(char)*strlen(s)*3+1); urlencode(us, s); txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100&ie=UTF-8%s", us, (opt&SC_OPT_IMAGE) ? "&tbm=isch" : ""); // fprintf(stdout, "%s\n", txtdoc); free(us); if (!txtdoc) { SC_LOG(SC_LOG_ERROR, c, "!txtdoc"); rs = -2; goto rc; } if (opt & SC_OPT_IMAGE) { imageclass = sc_find_class(txtdoc, "{font-family:Roboto,Helvetica,Arial,sans-serif}"); if (!imageclass) { SC_LOG(SC_LOG_ERROR, c, "!imageclass, txtdoc = %s", txtdoc); rs = -3; goto rc; } } else { titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}"); descclass = sc_find_class(txtdoc, "{word-break:break-word}"); if (!titleclass || !descclass) { SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass"); rs = -4; goto rc; } } #define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */ #define SC_GTXD /* description */ "../..//table//span[@class='%s']" #define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']" #define SC_GTXI "//div[@class='%s']//a" #define SC_GTR q->results[q->results_length-1] xpath = malloc(strlen((opt & SC_OPT_IMAGE) ? imageclass : titleclass)+strlen((opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF)); sprintf(xpath, (opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF, (opt & SC_OPT_IMAGE) ? imageclass : titleclass); xmldoc = parseHtmlDocument(txtdoc, NULL); if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */ SC_CWLE(c, c->queries_lock); q->results_length = 0; gnu_code_start; void sc_query_google_eachnode (xmlNodePtr node, void * data) { if (node->type == XML_ELEMENT_NODE) { xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href"); if (href) { char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* xmlGetProp copies and allocates */ if (!hreflink) { SC_LOG(SC_LOG_ERROR, c, "!hreflink"); rs = -5; return; } if (opt & SC_OPT_IMAGE) { char * imgurl = NULL; /* do not free those when allocated by sscanf, as they will directly go into the struct. */ char * imgrefurl = NULL; /* easy, huh? */ SC_LOG(SC_LOG_DEBUG, c, "hreflink = %s", hreflink); sscanf(hreflink, "/imgres?imgurl=%m[^&]&imgrefurl=%m[^&]", &imgurl, &imgrefurl); if (!imgurl && !imgrefurl) { SC_LOG(SC_LOG_ERROR, c, "!imgurl && !imgrefurl"); /* rs = -6; */ /* we continue running not fail because of a single picture */ free(imgurl); free(imgrefurl); return; /* check! */ } urldecode(imgurl, imgurl); urldecode(imgrefurl, imgrefurl); if (q->results_sizeof <= q->results_length) SC_BIGGER_ARRAY(q->results, sc_result, 1); q->results_length++; SC_GTR->query = q; SC_GTR->title = NULL; /* can't get title from here, would have to load /imgres, which is bloat */ SC_GTR->url = imgrefurl; SC_GTR->desc = imgurl; SC_GTR->breadcrumbs = NULL; } else { char * orig_hreflink_for_free = hreflink; sc_fix_url(&hreflink); char * x = malloc(strlen(descclass)+strlen(SC_GTXD)); char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB)); sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */); sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */); xmlNodePtr descnode = nthNodeXN(node, x, 0); if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */ descnode = nthNodeXN(node, "../../div/div", 0); xmlNodePtr breadnode = nthNodeXN(node, xbread, 0); free(x); free(xbread); if (q->results_sizeof <= q->results_length) SC_BIGGER_ARRAY(q->results, sc_result, 1); q->results_length++; SC_GTR->query = q; char * cp = (char *) xmlNodeGetContent(node->children); if (cp) { SC_GTR->title = malloc(strlen(cp)+1); strcpy(SC_GTR->title, cp); xmlFree(cp); } else SC_GTR->title = NULL; if (hreflink) { SC_GTR->url = malloc(strlen(hreflink)+1); strcpy(SC_GTR->url, hreflink); xmlFree(orig_hreflink_for_free); } else SC_GTR->url = NULL; cp = (char *) xmlNodeGetContent(descnode); if (cp) { SC_GTR->desc = malloc(strlen(cp)+1); strcpy(SC_GTR->desc, cp); xmlFree(cp); } else SC_GTR->desc = NULL; cp = (char *) xmlNodeGetContent(breadnode); if (cp) { SC_GTR->breadcrumbs = malloc(strlen(cp)+1); strcpy(SC_GTR->breadcrumbs, cp); xmlFree(cp); } } } } } eachNodeX(xmldoc, xpath, sc_query_google_eachnode, NULL); gnu_code_end; if (rs < 0) { SC_LOG(SC_LOG_ERROR, c, "rs < 0 (rs == %d)", rs); if (qwasgiven) SC_CUE(c, c->queries_lock); goto rc; } q->cache = c; q->lookup_time = time(NULL); q->engines = SC_ENGINE_GOOGLE; q->string = realloc(q->string, strlen(s)+1); q->opt = opt; strcpy(q->string, s); if (!qwasgiven) { SC_CWLE(c, c->queries_lock); if (c->queries_sizeof <= c->queries_length) SC_BIGGER_ARRAY(c->queries, sc_query, 0); c->queries_length++; #define SC_GTQ c->queries[c->queries_length-1] SC_GTQ = q; } SC_CUE(c, c->queries_lock); rc: if (!qwasgiven && rs < 0) sc_query_free(q); xmlFreeDoc(xmldoc); free(txtdoc); free(titleclass); free(descclass); free(imageclass); free(xpath); return (rs < 0) ? NULL : q; }