summaryrefslogblamecommitdiffstats
path: root/src/api.c
blob: 9a9bbc82ca8c9b3d48a9ead508f33d89f4fd97e7 (plain) (tree)

















































                                                                                                                                               
                                                                                                                              

                 
                                 























































                                                                                                                                    















                                                                                                                                            
                                                                                                                                    

                                                                                                                                         








                                                                                                                                                               
                                                                                                                                                  










                                                                                                                                                                 






                                 








                                    

                                                       
                                                                                                                                           


                                           
                                                   


                        














                                                                                                      
         
                                                                      

                                                                   
                                       
                                              


                                                                                                                                



                                                                                                                             






































































                                                                                                                                                         

                                         



                                                                 
                     





                                                                 



                                                    
                     



                                                           
                                                                 




                                              
   


                                 

                         
                        
                         
                    
                                   
 
/*
	convenience wrappers around sc_api() and sc_capix() that fill in the isfmt argument automatically:
		* `0##__VA_OPT__(1)` pastes to 01 when variadic arguments are supplied and to plain 0 when there are none, so
			isfmt is nonzero exactly when the caller passed printf-style arguments after the endpoint e.
		* `__VA_OPT__(,)` emits the separating comma only when there are variadic arguments to forward.
	c, b, h and e are passed through as the cache, body, headers and endpoint parameters respectively.
*/
#define SC_CAPI(c, b, h, e, ...) sc_api(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__)
#define SC_CAPIX(c, b, h, e, ...) sc_capix(c, b, h, 0##__VA_OPT__(1), e __VA_OPT__(,) __VA_ARGS__)
/*
	performs a HTTP request with libxml2's nanohttp and returns the response body as a NUL-terminated heap string.
		c: cache, used for logging. must not be NULL.
		body: request body. when non-NULL, the request is a POST carrying body, otherwise it is a GET.
		headers: extra request headers, appended to SC_HTTP_HEADERS. may be NULL.
		isfmt: when nonzero and endpoint contains conversion specifications, endpoint is treated as a printf-style
			format string and is expanded with the variadic arguments.
		endpoint: URL (or URL format string) to request. must not be NULL.
	returns: the body, which the caller must free(), or NULL when arguments are invalid, memory can't be allocated or
		the connection can't be established. a response code outside of 2xx and a read error are logged, but whatever
		has been read until then is still returned.
*/
char * sc_api (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
	if (!c || !endpoint)
		return NULL;
	char * endpoint_formatted = NULL;
	char * hedrs = NULL;
	char * contentType = NULL;
	char * redir = NULL;
	char * buf = NULL;
	size_t buf_sizeof = SC_HTTP_RBUFSIZE;
	size_t buf_length = 0;
	int readstatus = 0;
	long response_code = 0;
	void * r = NULL;
	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
		va_list ap, ap2;
		va_start(ap, endpoint);
		va_copy(ap2, ap); /* the first pass only measures, the second pass needs a fresh list */
		int needed = vsnprintf(NULL, 0, endpoint, ap);
		if (needed >= 0 && (endpoint_formatted = malloc((size_t) needed+1)))
			vsnprintf(endpoint_formatted, (size_t) needed+1, endpoint, ap2);
		va_end(ap);
		va_end(ap2);
		if (!endpoint_formatted) {
			SC_LOG(SC_LOG_ERROR, c, "!endpoint_formatted, endpoint: %s", endpoint);
			return NULL;
		}
	}
	const char * url = endpoint_formatted ? endpoint_formatted : endpoint;
	if (!headers)
		headers = "";
	hedrs = malloc(strlen(SC_HTTP_HEADERS)+strlen(headers)+1);
	buf = malloc(sizeof(char)*SC_HTTP_RBUFSIZE);
	if (!hedrs || !buf) {
		SC_LOG(SC_LOG_ERROR, c, "!hedrs || !buf, endpoint: %s", url);
		goto fail;
	}
	strcpy(hedrs, SC_HTTP_HEADERS);
	strcat(hedrs, headers);
	r = xmlNanoHTTPMethodRedir(
			url,
			body ? "POST" : "GET",
			body,
			&contentType,
			&redir,
			hedrs,
			body ? strlen(body) : 0
			);
	if (!r) { /* nothing has been written to buf, so returning it would hand uninitialized memory to the caller */
		SC_LOG(SC_LOG_ERROR, c, "!r, endpoint: %s", url);
		goto fail;
	}
	response_code = xmlNanoHTTPReturnCode(r);
	if (response_code < 200 || response_code >= 300) {
		SC_LOG(SC_LOG_ERROR, c, "response_code == %ld, endpoint: %s", response_code, url);
	}
	while ((readstatus = xmlNanoHTTPRead(r, buf+buf_length, buf_sizeof-buf_length)) > 0) {
		buf_length += readstatus;
		if (buf_sizeof-buf_length < SC_HTTP_RBUFSIZE) { /* grow early, so there's always room for the next read and the NUL */
			size_t bigger_sizeof = buf_sizeof;
			bigger_sizeof *= SC_REALLOC_K;
			char * bigger = realloc(buf, sizeof(char)*bigger_sizeof);
			if (!bigger) { /* buf is still valid here and gets released in fail */
				SC_LOG(SC_LOG_ERROR, c, "!bigger, endpoint: %s", url);
				xmlNanoHTTPClose(r);
				goto fail;
			}
			buf = bigger;
			buf_sizeof = bigger_sizeof;
		}
	}
	buf[buf_length] = '\0';
	if (readstatus == -1)
		SC_LOG(SC_LOG_ERROR, c, "readstatus == -1, endpoint: %s", url);
	xmlNanoHTTPClose(r);
	SC_LOG(SC_LOG_DEBUG, c, "contentType = %s, redir = %s", contentType ? contentType : "NULL", redir ? redir : "NULL");
	goto rc;
fail:
	free(buf);
	buf = NULL;
rc:
	free(endpoint_formatted);
	free(contentType);
	free(redir);
	free(hedrs);
	return buf;
}
/*
	same request as sc_api(), but the response body is parsed with parseHtmlDocument() and the document is returned.
		c, body, headers: passed to sc_api() unchanged.
		isfmt: when nonzero and endpoint contains conversion specifications, endpoint is expanded as a printf-style
			format string with the variadic arguments before the request is made.
		endpoint: URL (or URL format string). it is also passed to parseHtmlDocument() as its second argument.
	returns: whatever parseHtmlDocument() returns, or NULL when arguments are invalid, the endpoint can't be formatted
		or sc_api() returns no body.
*/
htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt, char * endpoint, ...) {
	if (!c || !endpoint)
		return NULL;
	char * endpoint_formatted = NULL;
	if (isfmt && parse_printf_format(endpoint, 0, NULL) > 0) {
		va_list ap, ap2;
		va_start(ap, endpoint);
		va_copy(ap2, ap); /* the first pass only measures, the second pass needs a fresh list */
		int needed = vsnprintf(NULL, 0, endpoint, ap);
		if (needed >= 0 && (endpoint_formatted = malloc((size_t) needed+1)))
			vsnprintf(endpoint_formatted, (size_t) needed+1, endpoint, ap2);
		va_end(ap);
		va_end(ap2);
		if (!endpoint_formatted) /* formatting or allocation failed, don't request the raw format string */
			return NULL;
	}
	char * url = endpoint_formatted ? endpoint_formatted : endpoint;
	htmlDocPtr htmldoc = NULL;
	char * buf = sc_api(c, body, headers, 0, url); /* isfmt is 0, because url is already expanded */
	if (buf)
		htmldoc = parseHtmlDocument(buf, url);
	free(buf);
	free(endpoint_formatted);
	return htmldoc;
}
/*
	finds the name of the CSS class whose rule body is `definition` inside the stylesheet text `haystack`.
		haystack: text to search, not modified.
		definition: literal rule body to look for, for example "{word-break:break-word}".
	the name is taken as the run of alphanumeric characters that follows the nearest '.' before the first occurrence
	of definition. google only has alphanumeric class names. TODO: be pedantic and conformic to w3 stds
	returns: newly allocated copy of the name, which you must free after calling, or NULL when an argument is NULL,
		definition does not occur, there is no '.' before it, the name runs to the end of haystack or memory can't be
		allocated.
*/
char * sc_find_class (char * haystack, const char * definition) { /* you must free class after calling */
	if (!haystack || !definition)
		return NULL;
	char * class = strstr(haystack, definition);
	if (!class)
		return NULL;
	while (class > haystack && class[-1] != '.') /* walk back until class points right after the dot */
		class--;
	if (class == haystack) /* reached the start without seeing a dot */
		return NULL;
	char * endofclass = class;
	while (isalnum((unsigned char) *endofclass)) /* cast: ctype functions are undefined for negative char values */
		endofclass++;
	if (!*endofclass) /* name is not terminated by anything, same as not found */
		return NULL;
	size_t length = (size_t) (endofclass-class);
	char * toreturn = malloc(length+1);
	if (!toreturn)
		return NULL;
	memcpy(toreturn, class, length);
	toreturn[length] = '\0';
	return toreturn;
}
/*
	strips google's tracking wrappers from a (result) URL in-place, so the resulting URL is shorter or equal.
		h: pointer to the string pointer. *h is moved forward inside the same buffer and the buffer is cut and
			urldecoded, so keep the original pointer around if the buffer has to be freed later.
	returns: -1 if h or *h is NULL, 1 otherwise.
*/
int sc_fix_url (char ** h) {
	static const char redirect_prefix[] = "/url?q=";
	static const char weblight_marker[] = "googleweblight.com/fp?u=";
	if (h == NULL || *h == NULL) /* nothing to work on */
		return -1;
	/* first wrapper: relative tracking link of a result, the real URL is the q parameter */
	if (strncmp(*h, redirect_prefix, sizeof redirect_prefix - 1) == 0) {
		char * target = *h + (sizeof redirect_prefix - 1);
		*h = target;
		*strchrnul(target, '&') = '\0'; /* drop all parameters after q */
		urldecode(target, target);
	}
	/* second wrapper: "light web" proxy link, the real URL is the u parameter. stateless, no cookie needed */
	char * weblight = strstr(*h, weblight_marker);
	if (weblight != NULL) {
		char * target = weblight + (sizeof weblight_marker - 1);
		*h = target;
		*strchrnul(target, '&') = '\0';
		urldecode(target, target);
	}
	/* TODO: be pedantic and remove utm_source and other tracking parameters */
	return 1;
}
struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q, SC_OPT_TYPE opt) { /* check4cachedB4 */
	/* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */
	/* if query is not NULL, it MUST be initialized */
	/*
		parameters:
			* s: search string, it gets urlencoded before it is put into the request URL
			* c: cache, used for logging, for its queries_lock and for storing a newly allocated query
			* q: query to fill or NULL, see above. a passed q is never freed here, not even on failure
			* opt: option flags. with SC_OPT_IMAGE set, image search results are requested and parsed instead of web results
		returns: the filled query, or NULL on failure. the internal status rs says why:
			-1 bad arguments, -2 no response body, -3 image class not found, -4 title or description class not found,
			-5 a result link has no readable href, -7 out of memory
		remarks:
			* we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website
			* we determine which class holds a specific value by looking at the css definitions
				- result title: the only class that has definition {color:#1967D2;font-size:14px;line-height:16px}
					+ A links have this class set, but they have a child SPAN element that then holds the text of the title
					+ A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link.
				- result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link.
					+ extract those two classes and find the one that is only present on SPAN text elements.
				- result description: once we have the result div, the description is the //table//span with the appropriate class
					+ the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements.
				- result div: to get the result div, we need the parent of the parent of the A link of the title.
			* result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP.
				- we won't parse those yet
			* I couldn't find anything with ratings, so we won't parse those either yet
			* captcha: google knows that this nokia phone we're pretending to be doesn't support javascript
				- the request limiting captcha must work on a phone without javascript. it is probably loaded inside an iframe, but has
					origin protection, so we can't just solve it client-side. we would have to proxy images and create some sort of a session
					based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
					implemented and will be hard work.
	*/
	int rs = 1;
	int qwasgiven = (q != NULL); /* must be known before the first goto rc, cleanup decides with it whether q is ours to free */
	char * xpath = NULL;
	char * descclass = NULL;
	char * titleclass = NULL;
	char * imageclass = NULL;
	htmlDocPtr xmldoc = NULL;
	char * txtdoc = NULL;
	if (!s || !c) {
		rs = -1;
		goto rc;
	}
	if (!q && !(q = sc_query_init())) {
		SC_LOG(SC_LOG_ERROR, c, "!q");
		rs = -7;
		goto rc;
	}
	char * us = malloc(sizeof(char)*strlen(s)*3+1); /* worst case: every byte becomes a %XX triplet */
	if (!us) {
		SC_LOG(SC_LOG_ERROR, c, "!us");
		rs = -7;
		goto rc;
	}
	urlencode(us, s);
	txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100&ie=UTF-8%s", us, (opt&SC_OPT_IMAGE) ? "&tbm=isch" : "");
	free(us);
	if (!txtdoc) {
		SC_LOG(SC_LOG_ERROR, c, "!txtdoc");
		rs = -2;
		goto rc;
	}
	if (opt & SC_OPT_IMAGE) {
		imageclass = sc_find_class(txtdoc, "{font-family:Roboto,Helvetica,Arial,sans-serif}");
		if (!imageclass) {
			SC_LOG(SC_LOG_ERROR, c, "!imageclass, txtdoc = %s", txtdoc);
			rs = -3;
			goto rc;
		}
	} else {
		titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
		descclass = sc_find_class(txtdoc, "{word-break:break-word}");
		if (!titleclass || !descclass) {
			SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass");
			rs = -4;
			goto rc;
		}
	}
#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */
#define SC_GTXD /* description */ "../..//table//span[@class='%s']"
#define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']"
#define SC_GTXI "//div[@class='%s']//a"
#define SC_GTR q->results[q->results_length-1]
	/* strlen(format)+strlen(class) is enough for all xpath buffers: the two bytes of %s make room for the NUL */
	xpath = malloc(strlen((opt & SC_OPT_IMAGE) ? imageclass : titleclass)+strlen((opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF));
	if (!xpath) {
		SC_LOG(SC_LOG_ERROR, c, "!xpath");
		rs = -7;
		goto rc;
	}
	sprintf(xpath, (opt & SC_OPT_IMAGE) ? SC_GTXI : SC_GTXF, (opt & SC_OPT_IMAGE) ? imageclass : titleclass);
	xmldoc = parseHtmlDocument(txtdoc, NULL);
	if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */
		SC_CWLE(c, c->queries_lock);
	q->results_length = 0;
	gnu_code_start;
	/* callback for every node matched by xpath. a nested function, so it can reach c, q, opt, rs and descclass */
	void sc_query_google_eachnode (xmlNodePtr node, void * data) {
		if (node->type == XML_ELEMENT_NODE) {
			xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
			if (href) {
				char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* xmlGetProp copies and allocates */
				if (!hreflink) {
					SC_LOG(SC_LOG_ERROR, c, "!hreflink");
					rs = -5;
					return;
				}
				if (opt & SC_OPT_IMAGE) {
					char * imgurl = NULL; /* do not free those when allocated by sscanf, as they will directly go into the struct. */
					char * imgrefurl = NULL; /* easy, huh? */
					SC_LOG(SC_LOG_DEBUG, c, "hreflink = %s", hreflink);
					sscanf(hreflink, "/imgres?imgurl=%m[^&]&imgrefurl=%m[^&]", &imgurl, &imgrefurl);
					xmlFree(hreflink); /* sscanf made its own copies, the attribute copy is not needed anymore */
					if (!imgurl || !imgrefurl) { /* both are required, a partial match would hand NULL to urldecode */
						SC_LOG(SC_LOG_ERROR, c, "!imgurl || !imgrefurl");
						/* rs = -6; */ /* we continue running not fail because of a single picture */
						free(imgurl);
						free(imgrefurl);
						return; /* check! */
					}
					urldecode(imgurl, imgurl);
					urldecode(imgrefurl, imgrefurl);
					if (q->results_sizeof <= q->results_length)
						SC_BIGGER_ARRAY(q->results, sc_result, 1);
					q->results_length++;
					SC_GTR->query = q;
					SC_GTR->title = NULL; /* can't get title from here, would have to load /imgres, which is bloat */
					SC_GTR->url = imgrefurl;
					SC_GTR->desc = imgurl;
					SC_GTR->breadcrumbs = NULL;
				} else {
					char * orig_hreflink_for_free = hreflink; /* sc_fix_url moves hreflink forward inside the same buffer */
					sc_fix_url(&hreflink);
					char * x = malloc(strlen(descclass)+strlen(SC_GTXD));
					char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB));
					if (!x || !xbread) {
						SC_LOG(SC_LOG_ERROR, c, "!x || !xbread");
						free(x);
						free(xbread);
						xmlFree(orig_hreflink_for_free);
						rs = -7;
						return;
					}
					sprintf(x, SC_GTXD, descclass);
					sprintf(xbread, SC_GTXB, descclass);
					xmlNodePtr descnode = nthNodeXN(node, x, 0);
					if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */
						descnode = nthNodeXN(node, "../../div/div", 0);
					xmlNodePtr breadnode = nthNodeXN(node, xbread, 0);
					free(x);
					free(xbread);
					if (q->results_sizeof <= q->results_length)
						SC_BIGGER_ARRAY(q->results, sc_result, 1);
					q->results_length++;
					SC_GTR->query = q;
					/* every field is copied into malloc memory, so results never hold libxml2 allocations. NULL means absent */
					char * cp = (char *) xmlNodeGetContent(node->children);
					if (cp) {
						if ((SC_GTR->title = malloc(strlen(cp)+1)))
							strcpy(SC_GTR->title, cp);
						xmlFree(cp);
					} else SC_GTR->title = NULL;
					if (hreflink) {
						if ((SC_GTR->url = malloc(strlen(hreflink)+1)))
							strcpy(SC_GTR->url, hreflink);
						xmlFree(orig_hreflink_for_free);
					} else SC_GTR->url = NULL;
					cp = (char *) xmlNodeGetContent(descnode);
					if (cp) {
						if ((SC_GTR->desc = malloc(strlen(cp)+1)))
							strcpy(SC_GTR->desc, cp);
						xmlFree(cp);
					} else SC_GTR->desc = NULL;
					cp = (char *) xmlNodeGetContent(breadnode);
					if (cp) {
						if ((SC_GTR->breadcrumbs = malloc(strlen(cp)+1)))
							strcpy(SC_GTR->breadcrumbs, cp);
						xmlFree(cp);
					} else SC_GTR->breadcrumbs = NULL; /* was left unassigned before */
				}
			}
		}
	}
	eachNodeX(xmldoc, xpath, sc_query_google_eachnode, NULL);
	gnu_code_end;
	if (rs >= 0) {
		char * qstring = realloc(q->string, strlen(s)+1);
		if (qstring) {
			q->string = qstring;
			strcpy(q->string, s);
		} else /* q->string is still valid and stays owned by q */
			rs = -7;
	}
	if (rs < 0) {
		SC_LOG(SC_LOG_ERROR, c, "rs < 0 (rs == %d)", rs);
		if (qwasgiven)
			SC_CUE(c, c->queries_lock);
		goto rc;
	}
	q->cache = c;
	q->lookup_time = time(NULL);
	q->engines = SC_ENGINE_GOOGLE;
	q->opt = opt;
	if (!qwasgiven) { /* a query allocated here gets appended to the cache. a given q already holds the lock since before parsing */
		SC_CWLE(c, c->queries_lock);
		if (c->queries_sizeof <= c->queries_length)
			SC_BIGGER_ARRAY(c->queries, sc_query, 0);
		c->queries_length++;
#define SC_GTQ c->queries[c->queries_length-1]
		SC_GTQ = q;
	}
	SC_CUE(c, c->queries_lock);
rc:
	if (!qwasgiven && rs < 0 && q) /* only free a query that was allocated here */
		sc_query_free(q);
	xmlFreeDoc(xmldoc);
	free(txtdoc);
	free(titleclass);
	free(descclass);
	free(imageclass);
	free(xpath);
	return (rs < 0) ? NULL : q;
}