summary refs log tree commit diff stats
path: root/src/api.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/api.c')
-rw-r--r-- src/api.c 40
1 files changed, 32 insertions, 8 deletions
diff --git a/src/api.c b/src/api.c
index 6ad996e..76431cc 100644
--- a/src/api.c
+++ b/src/api.c
@@ -107,6 +107,22 @@ char * sc_find_class (char * haystack, const char * definition) { /* you must fr
toreturn[endofclass-class] = '\0';
return toreturn;
}
+/*
+ * sc_fix_url - strip Google tracking wrappers from a (result) URL, in place.
+ *
+ * h: address of a pointer to a writable, NUL-terminated URL string.
+ *
+ * Two wrappers are recognised:
+ *   stage 1: the string starts with "/url?q=" (tracking link in google results);
+ *   stage 2: the string contains "googleweblight.com/fp?u=" ("light web"
+ *            tracking link in google results).
+ * In both cases the wrapped target is cut at the first '&', shifted to the
+ * front of the buffer and passed through urldecode(). The result is expected
+ * to be shorter than or equal to the input, so it fits in the same buffer.
+ *
+ * *h itself is never modified: because the cleaned URL is moved to the start
+ * of the caller's buffer, the owner of that buffer can still release it
+ * through the very same pointer afterwards.
+ *
+ * Returns -1 if h or *h is NULL, 1 otherwise (also when nothing matched).
+ */
+int sc_fix_url (char ** h) {
+	static const char redirect[] = "/url?q=";
+	static const char weblight[] = "googleweblight.com/fp?u=";
+	char * target = NULL;
+	if (!h || !*h) /* stage 0: prevent accidental death */
+		return -1;
+	if (!strncmp(*h, redirect, sizeof redirect - 1)) { /* stage 1: url may be tracking url by google results */
+		target = *h + (sizeof redirect - 1);
+		*strchrnul(target, '&') = '\0'; /* drop every parameter after the wrapped URL */
+		memmove(*h, target, strlen(target) + 1); /* regions overlap, so memmove and not memcpy */
+		urldecode(*h, *h);
+	}
+	if ((target = strstr(*h, weblight))) { /* stage 2: url may be "light web" tracking url by google results */
+		target += sizeof weblight - 1; /* we could disable this with a cookie but meh, this is easier and _stateless_ */
+		*strchrnul(target, '&') = '\0';
+		memmove(*h, target, strlen(target) + 1);
+		urldecode(*h, *h);
+	}
+	/* TODO: be pedantic and remove utm_source and other tracking parameters */
+	return 1;
+}
struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */
/* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */
/* if query is not NULL, it MUST be initialized */
@@ -146,7 +162,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
char * xpath = NULL;
char * descclass = NULL;
char * titleclass = NULL;
- char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us);
+ char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100", us);
// fprintf(stdout, "%s\n", txtdoc);
free(us);
if (!txtdoc) {
@@ -162,7 +178,8 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
goto rc;
}
#define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */
-#define SC_GTXD "../..//table//span[@class='%s']"
+#define SC_GTXD /* description */ "../..//table//span[@class='%s']"
+#define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']"
#define SC_GTR q->results[q->results_length-1]
xpath = malloc(strlen(titleclass)+strlen(SC_GTXF));
sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */
@@ -177,16 +194,18 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
if (node->type == XML_ELEMENT_NODE) {
xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href");
if (href) {
- char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href");
- if (!strncmp(hreflink, "/url?q=", strlen("/url?q="))) {
- hreflink = hreflink+strlen("/url?q=");
- *strchrnul(hreflink, '&') = '\0';
- urldecode(hreflink, hreflink);
- }
+ char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* fuck rules, I will rewrite it anyways <= hi future me */
+ sc_fix_url(&hreflink);
char * x = malloc(strlen(descclass)+strlen(SC_GTXD));
+ char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB));
sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */);
+ sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */);
xmlNodePtr descnode = nthNodeXN(node, x, 0);
+ if (!descnode) /* description may be above, see https://support.google.com/websearch?p=featured_snippets */
+ descnode = nthNodeXN(node, "../../div/div", 0);
+ xmlNodePtr breadnode = nthNodeXN(node, xbread, 0);
free(x);
+ free(xbread);
if (q->results_sizeof <= q->results_length)
SC_BIGGER_ARRAY(q->results, sc_result);
q->results_length++;
@@ -206,6 +225,11 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1);
strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION);
}
+ SC_GTR->breadcrumbs = (char *) xmlNodeGetContent(breadnode);
+ if (!SC_GTR->breadcrumbs) {
+ SC_GTR->breadcrumbs = malloc(strlen(SC_GTR->url)+1);
+ strcpy(SC_GTR->breadcrumbs, SC_GTR->url);
+ }
}
}
}