diff options
author | sijanec <anton@sijanec.eu> | 2021-04-01 23:30:37 +0200 |
---|---|---|
committer | sijanec <anton@sijanec.eu> | 2021-04-01 23:30:37 +0200 |
commit | 579048eaf89784ec1da8592d96311fafd49aea1a (patch) | |
tree | 61bf0c50c656f2b16ed8901ec3b07fb468ffb916 /src/api.c | |
download | sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.gz sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.bz2 sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.lz sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.xz sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.zst sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.zip |
Diffstat (limited to 'src/api.c')
-rw-r--r-- | src/api.c | 163 |
1 files changed, 163 insertions, 0 deletions
diff --git a/src/api.c b/src/api.c new file mode 100644 index 0000000..ae8d619 --- /dev/null +++ b/src/api.c @@ -0,0 +1,163 @@ +#define SC_CAPI(c, b, h, e, ...) sc_api(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__) +#define SC_CAPIX(c, b, h, e, ...) sc_capix(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__) +char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) { + if (!c || !endpoint) + return NULL; + size_t va_count = parse_printf_format(endpoint, 0, NULL); + char * endpoint_formatted = NULL; + long response_code = 0; + if (isfmt && va_count > 0 && endpoint_formatted == NULL) { + va_list ap, ap2; + va_start(ap, endpoint); + va_copy(ap2, ap); + size_t strlenm = vsnprintf(NULL, 0, endpoint, ap); + endpoint_formatted = malloc(sizeof(char)*strlenm+1); + vsnprintf(endpoint_formatted, strlenm+1, endpoint, ap2); + va_end(ap); + va_end(ap2); + } + if (!headers) + headers = ""; + char * hedrs = malloc(sizeof(char)*strlen(headers)+strlen(SC_HTTP_HEADERS)+1); + strcpy(hedrs, SC_HTTP_HEADERS); + strcat(hedrs, headers); + char * contentType = NULL; + char * redir = NULL; + char * buf = malloc(sizeof(char)*SC_HTTP_RBUFSIZE); + size_t buf_sizeof = SC_HTTP_RBUFSIZE; + size_t buf_length = 0; + int readstatus = 0; + void * r = xmlNanoHTTPMethodRedir( + endpoint_formatted ? endpoint_formatted : endpoint, + body ? "POST" : "GET", + body, + &contentType, + &redir, + hedrs, + body ? strlen(body) : 0 + ); + if (!r) { + SC_LOG(SC_LOG_ERROR, c, "!r, endpoint: %s", endpoint_formatted ? endpoint_formatted : endpoint); + goto rc; + } + response_code = xmlNanoHTTPReturnCode(r); + if (!(response_code - 200 >= 0 && response_code - 200 < 100)) { + SC_LOG(SC_LOG_ERROR, c, "response_code == %ld, endpoint: %s", response_code, endpoint_formatted ? endpoint_formatted:endpoint); + } + while ((readstatus = xmlNanoHTTPRead(r, buf+buf_length, buf_sizeof-buf_length)) > 0) { + buf_length += readstatus; + if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) { + buf_sizeof *= SC_REALLOC_K; + buf = realloc(buf, sizeof(char)*buf_sizeof); + } + } + if (readstatus == -1) + SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", endpoint_formatted ? endpoint_formatted : endpoint); + xmlNanoHTTPClose(r); + SC_LOG(SC_LOG_DEBUG, c, "contentType = %s, redir = %s", contentType ? contentType : "NULL", redir ? redir : "NULL"); +rc: + free(endpoint_formatted); + free(contentType); + free(redir); + free(hedrs); + return buf; +} +htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) { + if (!c || !endpoint) + return NULL; + size_t va_count = parse_printf_format(endpoint, 0, NULL); + char * endpoint_formatted = NULL; + if (isfmt && va_count > 0 && endpoint_formatted == NULL) { + va_list ap, ap2; + va_start(ap, endpoint); + va_copy(ap2, ap); + size_t strlenm = vsnprintf(NULL, 0, endpoint, ap); + endpoint_formatted = malloc(sizeof(char)*strlenm+1); + vsnprintf(endpoint_formatted, strlenm+1, endpoint, ap2); + va_end(ap); + va_end(ap2); + } + char * buf = sc_api(c, body, headers, 0, endpoint_formatted ? endpoint_formatted : endpoint); + htmlDocPtr htmldoc = parseHtmlDocument(buf, endpoint_formatted ? endpoint_formatted : endpoint); + free(buf); + free(endpoint_formatted); + return htmldoc; +} +char * sc_find_class (char * haystack, const char * definition) { /* you must free class after calling */ + if (!haystack || !definition) + return NULL; + char * class = strstr(haystack, definition); + if (!class) + return NULL; + int found = 0; + for (; class > haystack; class--) + if (class[-1] == '.' && (found = 1)) + break; + if (!found) + return NULL; + char * endofclass = class; + found = 0; + for (; *endofclass; endofclass++) /* google only has alphanumeric class names. TODO: be pedantic and conformic to w3 stds */ + if (!isalnum(endofclass[0]) && (found = 1)) + break; + if (!found) + return NULL; + char * toreturn = malloc(endofclass-class+1); + strncpy(toreturn, class, endofclass-class); + toreturn[endofclass-class] = '\0'; + return toreturn; +} +int sc_query_google (char * s, struct sc_cache * c) { + /* + remarks: + * we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website + * we determine which class holds a specific value by looking at the css definitions + - result title: the only class that has definition {color:#1967D2;font-size:14px;line-height:16px} + + A links have this class set, but they have a child SPAN element that then holds the text of the title + + A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link. + - result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link. + + extract those two classes and find the one that is only present on SPAN text elements. + - result description: once we have the result div, the description is the // span with the appropriate class + + the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements. + - result div: to get the result div, we need the parent of the parent of the A link of the title. + * result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP. + - we won't parse those yet + * I couldn't find anything with ratings, so we won't parse thouse either yet + * captcha: google knows that this nokia phone we're pretending to be doesn't support javascript + - the request limiting captcha must work on a phone without javascript. it is probably loaded inside an iframe, but has + origin protection, so we can't just solve it client-side. we would have to proxy images and create some sort of a session + based http request-response based user interface so we can ask the user to complete the captcha. this is not yet + implemeted and will be hard work. + */ + if (!s || !c) + return -1; + int rs = 1; + char * us = malloc(sizeof(char)*strlen(s)*3+1); + urlencode(us, s); + char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us); + // fprintf(stdout, "%s\n", txtdoc); + free(us); + if (!txtdoc) { + rs = -2; + goto rc; + } + char * titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}"); + if (!titleclass) { + SC_LOG(SC_LOG_ERROR, c, "!titleclass"); + rs = -3; + goto rc; + } +#define SC_GTXF "/html/body//a[contains(@class, '%s')]" // @class='fuLhoc ZWRArf'" + char * xpath = malloc(strlen(titleclass)+strlen(SC_GTXF)); + sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */ + fprintf(stdout, "%s\n", xpath); + htmlDocPtr xmldoc = parseHtmlDocument(txtdoc, NULL); + xmlXPathObjectPtr nodes = findNodes(xmldoc, xpath); + eachNode(nodes, printLinkNode, NULL); +rc: + xmlFreeDoc(xmldoc); + free(txtdoc); + free(titleclass); + free(xpath); + return rs; +} |