From 4916ba77785633cd0d74a05802d0fc08764f40dc Mon Sep 17 00:00:00 2001 From: sijanec Date: Sun, 4 Apr 2021 22:52:39 +0200 Subject: number of results=100, css fixes, breadcrumbs, removed googleweblight, desc on top fix --- src/api.c | 40 ++++++++++++++++++++++++++++++++-------- src/hp.html | 15 ++++++++++++++- src/httpd.c | 10 +++++++--- src/structs.c | 7 +++++++ 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/src/api.c b/src/api.c index 6ad996e..76431cc 100644 --- a/src/api.c +++ b/src/api.c @@ -107,6 +107,22 @@ char * sc_find_class (char * haystack, const char * definition) { /* you must fr toreturn[endofclass-class] = '\0'; return toreturn; } +int sc_fix_url (char ** h) { /* fixes a (result) URL held in *h by removing tracking nonsense, so the resulting URL is shorter or equal in length. Returns -1 when h or *h is NULL and 1 otherwise. NOTE: the leading wrapper is skipped by advancing *h, not by moving bytes, so after the call *h may point into the middle of the buffer that was passed in; a caller that has to release that buffer must keep its own copy of the original pointer. */ + if (!h || !*h) /* stage 0: prevent accidental death (NULL argument or NULL string) */ + return -1; + if (!strncmp(*h, "/url?q=", strlen("/url?q="))) { /* stage 1: url may be tracking url by google results */ + *h = *h+strlen("/url?q="); /* skip the wrapper prefix; *h now points inside the original string */ + *strchrnul(*h, '&') = '\0'; /* cut at the first ampersand; strchrnul yields the terminator when there is none, so the write is then a no-op */ + urldecode(*h, *h); /* same buffer passed as both arguments - presumably decodes in place; confirm against urldecode */ + } + char * c = NULL; + if ((c = strstr(*h, "googleweblight.com/fp?u="))) { /* stage 2: url may be "light web" tracking url by google results; the marker is searched for anywhere in the string, not only at its start */ + *h = c+strlen("googleweblight.com/fp?u="); /* we could disable this with a cookie but meh, this is easier and _stateless_ */ + *strchrnul(*h, '&') = '\0'; /* same truncation as in stage 1 */ + urldecode(*h, *h); /* same in-place style call as in stage 1 */ + } /* TODO: be pedantic and remove utm_source and other tracking parameters */ + return 1; +} struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q) { /* check for cached queries first! */ /* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. 
*/ /* if query is not NULL, it MUST be initialized */ @@ -146,7 +162,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s char * xpath = NULL; char * descclass = NULL; char * titleclass = NULL; - char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s", us); + char * txtdoc = SC_CAPI(c, NULL, NULL, "http://wap.google.com/search?q=%s&num=100", us); // fprintf(stdout, "%s\n", txtdoc); free(us); if (!txtdoc) { @@ -162,7 +178,8 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s goto rc; } #define SC_GTXF "/html/body//a[contains(@class, '%s')]" /* result a */ -#define SC_GTXD "../..//table//span[@class='%s']" +#define SC_GTXD /* description */ "../..//table//span[@class='%s']" +#define SC_GTXB /* breadcrumbs */ ".//span[@class='%s']" #define SC_GTR q->results[q->results_length-1] xpath = malloc(strlen(titleclass)+strlen(SC_GTXF)); sprintf(xpath, SC_GTXF, titleclass); /* whenever starts with titleclas */ @@ -177,16 +194,18 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s if (node->type == XML_ELEMENT_NODE) { xmlAttrPtr href = xmlHasProp(node, BAD_CAST "href"); if (href) { - char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); - if (!strncmp(hreflink, "/url?q=", strlen("/url?q="))) { - hreflink = hreflink+strlen("/url?q="); - *strchrnul(hreflink, '&') = '\0'; - urldecode(hreflink, hreflink); - } + char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* fuck rules, I will rewrite it anyways <= hi future me */ + sc_fix_url(&hreflink); char * x = malloc(strlen(descclass)+strlen(SC_GTXD)); + char * xbread = malloc(strlen(descclass)+strlen(SC_GTXB)); sprintf(x, SC_GTXD, descclass /* remember, kids, GNU C is fucking legendary */); + sprintf(xbread, SC_GTXB, descclass /* remember, kids, GNU C is fucking legendary */); xmlNodePtr descnode = nthNodeXN(node, x, 0); + if (!descnode) /* description may be above, see 
https://support.google.com/websearch?p=featured_snippets */ + descnode = nthNodeXN(node, "../../div/div", 0); + xmlNodePtr breadnode = nthNodeXN(node, xbread, 0); free(x); + free(xbread); if (q->results_sizeof <= q->results_length) SC_BIGGER_ARRAY(q->results, sc_result); q->results_length++; @@ -206,6 +225,11 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s SC_GTR->desc = malloc(strlen(SC_I18N_NO_DESCRIPTION)+1); strcpy(SC_GTR->desc, SC_I18N_NO_DESCRIPTION); } + SC_GTR->breadcrumbs = (char *) xmlNodeGetContent(breadnode); + if (!SC_GTR->breadcrumbs) { + SC_GTR->breadcrumbs = malloc(strlen(SC_GTR->url)+1); + strcpy(SC_GTR->breadcrumbs, SC_GTR->url); + } } } } diff --git a/src/hp.html b/src/hp.html index d2bc82f..47aff62 100644 --- a/src/hp.html +++ b/src/hp.html @@ -14,7 +14,7 @@ diff --git a/src/httpd.c b/src/httpd.c index bf5c3d1..656ad92 100644 --- a/src/httpd.c +++ b/src/httpd.c @@ -9,13 +9,17 @@ char * sc_queryhtml (struct sc_query * q) { /* remember to free returned string string##_sizeof = (string##_written+wanted+1)*SC_REALLOC_K; \ string = realloc(string, string##_sizeof); \ } -#define SC_HRF "

%s

%s

" +#define SC_HRF "

%s " \ + "%s

%s

" +#define SC_HRA i, safeurl, i, safetitle, safebreadcrumbs, safebody char * safetitle = htmlspecialchars(q->results[i]->title); char * safebody = htmlspecialchars(q->results[i]->desc); char * safeurl = htmlspecialchars(q->results[i]->url); - size_t ws = snprintf(NULL, 0, SC_HRF, safeurl, safetitle, safebody); + char * safebreadcrumbs = htmlspecialchars(q->results[i]->breadcrumbs); + size_t ws = snprintf(NULL, 0, SC_HRF, SC_HRA); SC_HRC(resultshtml, ws); - resultshtml_written += sprintf(resultshtml+resultshtml_written, SC_HRF, safeurl, safetitle, safebody); + resultshtml_written += sprintf(resultshtml+resultshtml_written, SC_HRF, SC_HRA); + free(safebreadcrumbs); free(safetitle); free(safebody); free(safeurl); diff --git a/src/structs.c b/src/structs.c index b99f1eb..2d83f74 100644 --- a/src/structs.c +++ b/src/structs.c @@ -36,9 +36,14 @@ struct sc_result { time_t date; /* some search engines like to extract a date from a website, store that here - not implemented */ unsigned short int rating; /* some search engines like to extract a rating from a website, store that here */ /* not implementd */ unsigned short int rating_max; /* max rating when above is used /\ */ /* not implemented yet */ + char * breadcrumbs; /* yesfree - google has nice breadcrumbs, when hovering over the URL requires too much time (: */ }; struct sc_result * sc_result_init () { struct sc_result * r = calloc(1, sizeof(struct sc_result)); + r->url = NULL; + r->desc = NULL; + r->title = NULL; + r->breadcrumbs = NULL; return r; } int sc_result_free (struct sc_result * r) { @@ -47,6 +52,7 @@ int sc_result_free (struct sc_result * r) { free(r->url); free(r->desc); free(r->title); + free(r->breadcrumbs); free(r); return 1; } @@ -65,6 +71,7 @@ struct sc_query * sc_query_init () { q->results[i] = sc_result_init(); q->results[i]->query = q; } + q->string = NULL; return q; } int sc_query_free (struct sc_query * q) { -- cgit v1.2.3