From bbf6fe2fd069ef89f73ecc3fe3ec2000833f05f4 Mon Sep 17 00:00:00 2001
From: sijanec <anton@sijanec.eu>
Date: Sat, 3 Apr 2021 23:15:48 +0200
Subject: initial release

---
 src/api.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 88 insertions(+), 15 deletions(-)

(limited to 'src/api.c')

diff --git a/src/api.c b/src/api.c
index ae8d619..6ad996e 100644
--- a/src/api.c
+++ b/src/api.c
@@ -107,7 +107,9 @@ char * sc_find_class (char * haystack, const char * definition) { /* you must fr
 	toreturn[endofclass-class] = '\0';
 	return toreturn;
 }
-int sc_query_google (char * s, struct sc_cache * c) {
+struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */
+	/* q is NULL in most cases: a new query is then allocated here and appended to c->queries. otherwise the response is stored into the passed q. */
+	/* if q is not NULL, it MUST already be initialized */
 	/*
 		remarks:
 			* we are using wap.google.com over HTTP and with a user-agent string of a nokia mobile phone, so we get a lite website
@@ -117,7 +119,7 @@ int sc_query_google (char * s, struct sc_cache * c) {
 					+ A href points to a tracking relative link, starting with /url?q=. the q parameter contains the (obv urlencoded) link.
 				- result date: class has only one definition, {color:#70757a}, but same definition has the class for the settings A link.
 					+ extract those two classes and find the one that is only present on SPAN text elements.
-				- result description: once we have the result div, the description is the // span with the appropriate class
+				- result description: once we have the result div, the description is the //table//span with the appropriate class
 					+ the appropriate class is the only one with {word-break:break-word}. note that this class also describes other elements.
 				- result div: to get the result div, we need the parent of the parent of the A link of the title.
 			* result dates are sometimes relative ("an hour ago") and heavily depend on the client location, based on IP.
@@ -129,35 +131,106 @@ int sc_query_google (char * s, struct sc_cache * c) {
 					based http request-response based user interface so we can ask the user to complete the captcha. this is not yet
 					implemeted and will be hard work.
 	*/
-	if (!s || !c)
-		return -1;
-	int rs = 1;
+	int rs;
+	if (!s || !c) {
+		rs = -1;
+		goto rc;
+	}
+	int qwasgiven = 0;
+	if (!q)
+		q = sc_query_init();
+	else
+		qwasgiven++;
 	char * us = malloc(sizeof(char)*strlen(s)*3+1);
 	urlencode(us, s);
+	char * xpath = NULL;
+	char * descclass = NULL;
+	char * titleclass = NULL;
 	char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us);
 	// fprintf(stdout, "%s\n", txtdoc);
 	free(us);
 	if (!txtdoc) {
+		SC_LOG(SC_LOG_ERROR, c, "!txtdoc");
 		rs = -2;
 		goto rc;
 	}
-	char * titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
-	if (!titleclass) {
-		SC_LOG(SC_LOG_ERROR, c, "!titleclass");
+	titleclass = sc_find_class(txtdoc, "{color:#1967D2;font-size:14px;line-height:16px}");
+	descclass = sc_find_class(txtdoc, "{word-break:break-word}");
+	if (!titleclass || !descclass) {
+		SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass");
 		rs = -3;
 		goto rc;
 	}
-#define SC_GTXF "/html/body//a[contains(@class, '%s')]" // @class='fuLhoc ZWRArf'"
-	char * xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
+#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */
+#define SC_GTXD "../..//table//span[@class='%s']"
+#define SC_GTR q->results[q->results_length-1]
+	xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
 	sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */
-	fprintf(stdout, "%s\n", xpath);
 	htmlDocPtr xmldoc = parseHtmlDocument(txtdoc, NULL);
-	xmlXPathObjectPtr nodes = findNodes(xmldoc, xpath);
-	eachNode(nodes, printLinkNode, NULL);
-rc:
+	if (qwasgiven) /* as you can see, when q is given, queries will be write-locked for the whole XML processing time! */
+		SC_CWLE(c, c->queries_lock);
+	q->results_length = 0;
+	gnu_code_start;
+	eachNodeX(xmldoc, xpath,
+			lambda(void, (xmlNodePtr node, void * data), 
+				{
+					if (node->type == XML_ELEMENT_NODE) {
+						xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
+						if (href) {
+							char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href");
+							if (!strncmp(hreflink, "/url?q=", strlen("/url?q="))) {
+								hreflink = hreflink+strlen("/url?q=");
+								*strchrnul(hreflink, '&') = '\0';
+								urldecode(hreflink, hreflink);
+							}
+							char * x = malloc(strlen(descclass)+strlen(SC_GTXD));
+							sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */);
+							xmlNodePtr descnode = nthNodeXN(node, x, 0);
+							free(x);
+							if (q->results_sizeof <= q->results_length)
+								SC_BIGGER_ARRAY(q->results, sc_result);
+							q->results_length++;
+							SC_GTR->query = q;
+							SC_GTR->title = (char *) xmlNodeGetContent(node->children);
+							if (!SC_GTR->title) {
+								SC_GTR->title = malloc(strlen(SC_I18N_NO_TITLE)+1);
+								strcpy(SC_GTR->title, SC_I18N_NO_TITLE);
+							}
+							SC_GTR->url = hreflink;
+							if (!SC_GTR->url) {
+								SC_GTR->url = malloc(strlen(SC_I18N_NO_HREFLINK)+1);
+								strcpy(SC_GTR->url, SC_I18N_NO_HREFLINK);
+							}
+							SC_GTR->desc = (char *) xmlNodeGetContent(descnode);
+							if (!SC_GTR->desc) {
+								SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1);
+								strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION);
+							}
+						}
+					}
+				}
+			),
+		NULL);
+	gnu_code_end;
+	q->cache = c;
+	q->lookup_time = time(NULL);
+	q->engines = SC_ENGINE_GOOGLE;
+	q->string = realloc(q->string, strlen(s)+1);
+	strcpy(q->string, s);
+	if (!qwasgiven) {
+		SC_CWLE(c, c->queries_lock);
+		if (c->queries_sizeof <= c->queries_length)
+			SC_BIGGER_ARRAY(c->queries, sc_query);
+		c->queries_length++;
+#define SC_GTQ c->queries[c->queries_length-1]
+		SC_GTQ = q;
+	}
+	SC_CUE(c, c->queries_lock);
 	xmlFreeDoc(xmldoc);
+rc:
 	free(txtdoc);
 	free(titleclass);
+	free(descclass);
 	free(xpath);
-	return rs;
+	return (rs < 0) ? NULL : q;
 }
-- 
cgit v1.2.3