From bbf6fe2fd069ef89f73ecc3fe3ec2000833f05f4 Mon Sep 17 00:00:00 2001 From: sijanec Date: Sat, 3 Apr 2021 23:15:48 +0200 Subject: initial release --- src/api.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 15 deletions(-) (limited to 'src/api.c') diff --git a/src/api.c b/src/api.c index ae8d619..6ad996e 100644 --- a/src/api.c +++ b/src/api.c @@ -107,7 +107,9 @@ char * sc_find_class (char * haystack, const char * definition) { /* you must fr toreturn[endofclass-class] = '\0'; return toreturn; } -int sc_query_google (char * s, struct sc_cache * c) { +struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */ + /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */ + /* if query is not NULL, it MUST be initialized */ /* remarks: * we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website @@ -117,7 +119,7 @@ int sc_query_google (char * s, struct sc_cache * c) { + A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link. - result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link. + extract those two classes and find the one that is only present on SPAN text elements. - - result description: once we have the result div, the description is the // span with the appropriate class + - result description: once we have the result div, the description is the //table//span with the appropriate class + the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements. - result div: to get the result div, we need the parent of the parent of the A link of the title. * result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP. @@ -129,35 +131,106 @@ int sc_query_google (char * s, struct sc_cache * c) { based http request-response based user interface so we can ask the user to complete the captcha. this is not yet implemeted and will be hard work. */ - if (!s || !c) - return -1; - int rs = 1; + int rs; + if (!s || !c) { + rs = -1; + goto rc; + } + int qwasgiven = 0; + if (!q) + q = sc_query_init(); + else + qwasgiven++; char * us = malloc(sizeof(char)*strlen(s)*3+1); urlencode(us, s); + char * xpath = NULL; + char * descclass = NULL; + char * titleclass = NULL; char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us); // fprintf(stdout, "%s\n", txtdoc); free(us); if (!txtdoc) { + SC_LOG(SC_LOG_ERROR, c, "!txtdoc"); rs = -2; goto rc; } - char * titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}"); - if (!titleclass) { - SC_LOG(SC_LOG_ERROR, c, "!titleclass"); + titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}"); + descclass = sc_find_class(txtdoc, "{word-break:break-word}"); + if (!titleclass || !descclass) { + SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass"); rs = -3; goto rc; } -#define SC_GTXF "/html/body//a[contains(@class, '%s')]" // @class='fuLhoc ZWRArf'" - char * xpath = malloc(strlen(titleclass)+strlen(SC_GTXF)); +#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */ +#define SC_GTXD "../..//table//span[@class='%s']" +#define SC_GTR q->results[q->results_length-1] + xpath = malloc(strlen(titleclass)+strlen(SC_GTXF)); sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */ - fprintf(stdout, "%s\n", xpath); htmlDocPtr xmldoc = parseHtmlDocument(txtdoc, NULL); - xmlXPathObjectPtr nodes = findNodes(xmldoc, xpath); - eachNode(nodes, printLinkNode, NULL); -rc: + if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */ + SC_CWLE(c, c->queries_lock); + q->results_length = 0; + gnu_code_start; + eachNodeX(xmldoc, xpath, + lambda(void, (xmlNodePtr node, void * data), + { + if (node->type == XML_ELEMENT_NODE) { + xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href"); + if (href) { + char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); + if (!strncmp(hreflink, "/url?q=", strlen("/url?q="))) { + hreflink = hreflink+strlen("/url?q="); + *strchrnul(hreflink, '&') = '\0'; + urldecode(hreflink, hreflink); + } + char * x = malloc(strlen(descclass)+strlen(SC_GTXD)); + sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */); + xmlNodePtr descnode = nthNodeXN(node, x, 0); + free(x); + if (q->results_sizeof <= q->results_length) + SC_BIGGER_ARRAY(q->results, sc_result); + q->results_length++; + SC_GTR->query = q; + SC_GTR->title = (char *) xmlNodeGetContent(node->children); + if (!SC_GTR->title) { + SC_GTR->title = malloc(strlen(SC_I18N_NO_TITLE)+1); + strcpy(SC_GTR->title, SC_I18N_NO_TITLE); + } + SC_GTR->url = hreflink; + if (!SC_GTR->url) { + SC_GTR->url = malloc(strlen(SC_I18N_NO_HREFLINK)+1); + strcpy(SC_GTR->url, SC_I18N_NO_HREFLINK); + } + SC_GTR->desc = (char *) xmlNodeGetContent(descnode); + if (!SC_GTR->desc) { + SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1); + strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION); + } + } + } + } + ), + NULL); + gnu_code_end; + q->cache = c; + q->lookup_time = time(NULL); + q->engines = SC_ENGINE_GOOGLE; + q->string = realloc(q->string, strlen(s)+1); + strcpy(q->string, s); + if (!qwasgiven) { + SC_CWLE(c, c->queries_lock); + if (c->queries_sizeof <= c->queries_length) + SC_BIGGER_ARRAY(c->queries, sc_query); + c->queries_length++; +#define SC_GTQ c->queries[c->queries_length-1] + SC_GTQ = q; + } + SC_CUE(c, c->queries_lock); xmlFreeDoc(xmldoc); +rc: free(txtdoc); free(titleclass); + free(descclass); free(xpath); - return rs; + return (rs < 0) ? NULL : q; } -- cgit v1.2.3