summaryrefslogtreecommitdiffstats
path: root/src/api.c
diff options
context:
space:
mode:
authorsijanec <anton@sijanec.eu>2021-04-01 23:30:37 +0200
committersijanec <anton@sijanec.eu>2021-04-01 23:30:37 +0200
commit579048eaf89784ec1da8592d96311fafd49aea1a (patch)
tree61bf0c50c656f2b16ed8901ec3b07fb468ffb916 /src/api.c
downloadsear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar
sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.gz
sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.bz2
sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.lz
sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.xz
sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.tar.zst
sear.c-579048eaf89784ec1da8592d96311fafd49aea1a.zip
Diffstat (limited to 'src/api.c')
-rw-r--r--src/api.c163
1 files changed, 163 insertions, 0 deletions
diff --git a/src/api.c b/src/api.c
new file mode 100644
index 0000000..ae8d619
--- /dev/null
+++ b/src/api.c
@@ -0,0 +1,163 @@
+/*
+ * Convenience wrappers around sc_api() and sc_capix().
+ * The isfmt argument is generated automatically: it is 0 when no variadic
+ * arguments follow the endpoint and 01 when at least one does, so an endpoint
+ * passed without arguments is not treated as a printf format string.
+ */
+#define SC_CAPI(cache, body, headers, endpoint, ...) sc_api(cache, body, headers, 0##__VA_OPT__(1), endpoint __VA_OPT__(,) __VA_ARGS__)
+#define SC_CAPIX(cache, body, headers, endpoint, ...) sc_capix(cache, body, headers, 0##__VA_OPT__(1), endpoint __VA_OPT__(,) __VA_ARGS__)
+/*
+ * sc_api: perform an HTTP request and return the complete response body.
+ *
+ * c        - cache/context handle, only used for logging here; must not be NULL
+ * body     - request body; when non-NULL the request is a POST, otherwise a GET
+ * headers  - extra request headers appended to SC_HTTP_HEADERS; may be NULL
+ * isfmt    - when non-zero and endpoint contains conversion specifications,
+ *            endpoint is a printf-style format consumed together with the
+ *            variadic arguments
+ * endpoint - URL (or URL format string); must not be NULL
+ *
+ * Returns a malloc()ed, NUL-terminated buffer that the caller must free().
+ * When the request could not be started the buffer is an empty string.
+ * Returns NULL on invalid arguments, formatting failure or allocation failure.
+ * A response code outside 2xx is only logged; the body is still returned.
+ */
+char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
+	if (!c || !endpoint)
+		return NULL;
+	char * endpoint_formatted = NULL;
+	char * hedrs = NULL;
+	char * contentType = NULL;
+	char * redir = NULL;
+	char * buf = NULL;
+	size_t buf_sizeof = SC_HTTP_RBUFSIZE;
+	size_t buf_length = 0;
+	int readstatus = 0;
+	long response_code = 0;
+	void * r = NULL;
+	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
+		va_list ap, ap2;
+		va_start(ap, endpoint);
+		va_copy(ap2, ap);
+		/* vsnprintf returns an int that is negative on error; keep it signed until checked */
+		int needed = vsnprintf(NULL, 0, endpoint, ap);
+		va_end(ap);
+		if (needed >= 0)
+			endpoint_formatted = malloc((size_t) needed + 1);
+		if (endpoint_formatted)
+			vsnprintf(endpoint_formatted, (size_t) needed + 1, endpoint, ap2);
+		va_end(ap2);
+		if (!endpoint_formatted) {
+			SC_LOG(SC_LOG_ERROR, c, "failed to format endpoint: %s", endpoint);
+			return NULL;
+		}
+	}
+	const char * url = endpoint_formatted ? endpoint_formatted : endpoint;
+	if (!headers)
+		headers = "";
+	size_t hedrs_size = strlen(SC_HTTP_HEADERS) + strlen(headers) + 1;
+	hedrs = malloc(hedrs_size);
+	buf = malloc(buf_sizeof);
+	if (!hedrs || !buf) {
+		SC_LOG(SC_LOG_ERROR, c, "out of memory, endpoint: %s", url);
+		free(buf);
+		buf = NULL;
+		goto rc;
+	}
+	snprintf(hedrs, hedrs_size, "%s%s", SC_HTTP_HEADERS, headers);
+	buf[0] = '\0'; /* a request that cannot be started yields "" instead of uninitialized memory */
+	r = xmlNanoHTTPMethodRedir(
+		url,
+		body ? "POST" : "GET",
+		body,
+		&contentType,
+		&redir,
+		hedrs,
+		body ? strlen(body) : 0
+	);
+	if (!r) {
+		SC_LOG(SC_LOG_ERROR, c, "!r, endpoint: %s", url);
+		goto rc;
+	}
+	response_code = xmlNanoHTTPReturnCode(r);
+	if (response_code < 200 || response_code >= 300) {
+		SC_LOG(SC_LOG_ERROR, c, "response_code == %ld, endpoint: %s", response_code, url);
+	}
+	/* at the top of every iteration at least one byte of buf is free, which leaves room for the terminator */
+	while ((readstatus = xmlNanoHTTPRead(r, buf+buf_length, buf_sizeof-buf_length)) > 0) {
+		buf_length += readstatus;
+		if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) {
+			size_t grown = buf_sizeof * SC_REALLOC_K;
+			if (grown <= buf_length) /* growth factor too small to leave any free space */
+				grown = buf_length + SC_HTTP_RBUFSIZE;
+			char * bigger = realloc(buf, grown);
+			if (!bigger) {
+				SC_LOG(SC_LOG_ERROR, c, "out of memory while reading, endpoint: %s", url);
+				free(buf);
+				buf = NULL;
+				break;
+			}
+			buf = bigger;
+			buf_sizeof = grown;
+		}
+	}
+	if (readstatus == -1)
+		SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", url);
+	xmlNanoHTTPClose(r);
+	if (buf)
+		buf[buf_length] = '\0'; /* callers use the result as a C string */
+	SC_LOG(SC_LOG_DEBUG, c, "contentType = %s, redir = %s", contentType ? contentType : "NULL", redir ? redir : "NULL");
+rc:
+	free(endpoint_formatted);
+	free(contentType);
+	free(redir);
+	free(hedrs);
+	return buf;
+}
+/*
+ * sc_capix: like sc_api(), but parses the response body as an HTML document.
+ *
+ * The arguments have the same meaning as for sc_api(). The endpoint is
+ * formatted here and then handed to sc_api() with isfmt == 0, so it is never
+ * interpreted as a format string twice.
+ *
+ * Returns whatever parseHtmlDocument() produces for the response, or NULL on
+ * invalid arguments, formatting/allocation failure or when sc_api() returns
+ * no buffer. Releasing the returned document is left to the caller.
+ */
+htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
+	if (!c || !endpoint)
+		return NULL;
+	char * endpoint_formatted = NULL;
+	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
+		va_list ap, ap2;
+		va_start(ap, endpoint);
+		va_copy(ap2, ap);
+		/* vsnprintf returns an int that is negative on error; keep it signed until checked */
+		int needed = vsnprintf(NULL, 0, endpoint, ap);
+		va_end(ap);
+		if (needed >= 0)
+			endpoint_formatted = malloc((size_t) needed + 1);
+		if (endpoint_formatted)
+			vsnprintf(endpoint_formatted, (size_t) needed + 1, endpoint, ap2);
+		va_end(ap2);
+		if (!endpoint_formatted) {
+			SC_LOG(SC_LOG_ERROR, c, "failed to format endpoint: %s", endpoint);
+			return NULL;
+		}
+	}
+	char * url = endpoint_formatted ? endpoint_formatted : endpoint;
+	htmlDocPtr htmldoc = NULL;
+	char * buf = sc_api(c, body, headers, 0, url);
+	if (buf) /* do not hand a missing response to the parser */
+		htmldoc = parseHtmlDocument(buf, url);
+	free(buf);
+	free(endpoint_formatted);
+	return htmldoc;
+}
+/*
+ * sc_find_class: find the CSS class name whose rule contains `definition`.
+ *
+ * Locates the first occurrence of definition in haystack, walks backwards to
+ * the nearest preceding '.' and returns the run of alphanumeric characters
+ * that follows that dot.
+ *
+ * Returns a malloc()ed, NUL-terminated copy of the class name which the
+ * caller must free(), or NULL when:
+ *   - either argument is NULL,
+ *   - definition does not occur in haystack,
+ *   - no '.' precedes the match within the same rule (a '}' is hit first),
+ *   - the name runs to the end of haystack without a terminating character,
+ *   - memory cannot be allocated.
+ */
+char * sc_find_class (char * haystack, const char * definition) { /* you must free class after calling */
+	if (!haystack || !definition)
+		return NULL;
+	char * class = strstr(haystack, definition);
+	if (!class)
+		return NULL;
+	/* step back until class points just behind the selector's '.' */
+	while (class > haystack && class[-1] != '.') {
+		if (class[-1] == '}') /* end of the previous rule: this selector has no class */
+			return NULL;
+		class--;
+	}
+	if (class == haystack) /* start of text reached without seeing a '.' */
+		return NULL;
+	/* google only has alphanumeric class names. TODO: be pedantic and conform to the w3 standards */
+	char * endofclass = class;
+	while (*endofclass && isalnum((unsigned char) *endofclass)) /* the cast keeps bytes >= 0x80 defined for isalnum */
+		endofclass++;
+	if (!*endofclass)
+		return NULL;
+	size_t length = (size_t) (endofclass - class);
+	char * toreturn = malloc(length + 1);
+	if (!toreturn)
+		return NULL;
+	memcpy(toreturn, class, length);
+	toreturn[length] = '\0';
+	return toreturn;
+}
+/*
+ * sc_query_google: run the search query s against wap.google.com and print
+ * the result title links.
+ *
+ * s - the search string (URL-encoded here before it is put into the URL)
+ * c - cache/context handle passed on to the HTTP layer and the logger
+ *
+ * Returns 1 on success, -1 on NULL arguments, -2 when no response text was
+ * obtained, -3 when the title class could not be located in the stylesheet
+ * and -4 when memory could not be allocated.
+ */
+int sc_query_google (char * s, struct sc_cache * c) {
+	/*
+		remarks:
+		* we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website
+		* we determine which class holds a specific value by looking at the css definitions
+			- result title: the only class that has definition {color:#1967D2;font-size:14px;line-height:16px}
+				+ A links have this class set, but they have a child SPAN element that then holds the text of the title
+				+ A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link.
+			- result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link.
+				+ extract those two classes and find the one that is only present on SPAN text elements.
+			- result description: once we have the result div, the description is the // span with the appropriate class
+				+ the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements.
+			- result div: to get the result div, we need the parent of the parent of the A link of the title.
+		* result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP.
+			- we won't parse those yet
+		* I couldn't find anything with ratings, so we won't parse those either yet
+		* captcha: google knows that this nokia phone we're pretending to be doesn't support javascript
+			- the request limiting captcha must work on a phone without javascript. it is probably loaded inside an iframe, but has
+				origin protection, so we can't just solve it client-side. we would have to proxy images and create some sort of a session
+				based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
+				implemented and will be hard work.
+	*/
+	if (!s || !c)
+		return -1;
+	int rs = 1;
+	/* everything released at rc: starts out NULL, so an early goto never frees an uninitialized pointer */
+	char * txtdoc = NULL;
+	char * titleclass = NULL;
+	char * xpath = NULL;
+	htmlDocPtr xmldoc = NULL;
+	char * us = malloc(sizeof(char)*strlen(s)*3+1); /* buffer of three bytes per input byte plus the terminator */
+	if (!us) {
+		rs = -4;
+		goto rc;
+	}
+	urlencode(us, s);
+	txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us);
+	free(us);
+	if (!txtdoc) {
+		rs = -2;
+		goto rc;
+	}
+	titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
+	if (!titleclass) {
+		SC_LOG(SC_LOG_ERROR, c, "!titleclass");
+		rs = -3;
+		goto rc;
+	}
+	/* matches every A element whose class attribute contains the title class, e.g. @class='fuLhoc ZWRArf' */
+#define SC_GTXF "/html/body//a[contains(@class, '%s')]"
+	size_t xpath_size = strlen(SC_GTXF) + strlen(titleclass) + 1;
+	xpath = malloc(xpath_size);
+	if (!xpath) {
+		rs = -4;
+		goto rc;
+	}
+	snprintf(xpath, xpath_size, SC_GTXF, titleclass);
+	fprintf(stdout, "%s\n", xpath);
+	xmldoc = parseHtmlDocument(txtdoc, NULL);
+	/* NOTE(review): nodes is not released here; confirm whether eachNode() frees it or whether xmlXPathFreeObject() is needed */
+	xmlXPathObjectPtr nodes = findNodes(xmldoc, xpath);
+	eachNode(nodes, printLinkNode, NULL);
+rc:
+	xmlFreeDoc(xmldoc);
+	free(txtdoc);
+	free(titleclass);
+	free(xpath);
+	return rs;
+}