summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnton Luka Šijanec <anton@sijanec.eu>2022-01-31 19:16:55 +0100
committerAnton Luka Šijanec <anton@sijanec.eu>2022-01-31 19:16:55 +0100
commitf26b60718163cfe454df368099ca1901acafc047 (patch)
tree5446a485e682f281ee6b46e02ea41d397b05f186
parentunicode queries now work 0.0.19 (diff)
downloadsear.c-f26b60718163cfe454df368099ca1901acafc047.tar
sear.c-f26b60718163cfe454df368099ca1901acafc047.tar.gz
sear.c-f26b60718163cfe454df368099ca1901acafc047.tar.bz2
sear.c-f26b60718163cfe454df368099ca1901acafc047.tar.lz
sear.c-f26b60718163cfe454df368099ca1901acafc047.tar.xz
sear.c-f26b60718163cfe454df368099ca1901acafc047.tar.zst
sear.c-f26b60718163cfe454df368099ca1901acafc047.zip
-rw-r--r--.gitignore1
-rw-r--r--debian/changelog7
-rw-r--r--src/api.c26
-rw-r--r--src/hp.php2
-rw-r--r--src/httpd.c60
-rw-r--r--src/i18n.h7
-rw-r--r--src/main.c26
7 files changed, 107 insertions, 22 deletions
diff --git a/.gitignore b/.gitignore
index 7620845..2344f15 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ tmp/
valgrind-out.txt
core
a.out
+.gdb_history
diff --git a/debian/changelog b/debian/changelog
index fdf84d7..dd2f4f0 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,10 @@
+sear.c (0.0.20-1) stable; urgency=low
+
+ * Added fallback option to another server via HTTP redirect in case of
+ CAPTCHA.
+
+ -- Anton Luka Šijanec <anton@sijanec.eu> Mon, 31 Jan 2022 19:00:00 +0200
+
sear.c (0.0.19-1) stable; urgency=low
* Fixed casting to char that overflowed when stepping over strings in
diff --git a/src/api.c b/src/api.c
index a366882..1a8c85a 100644
--- a/src/api.c
+++ b/src/api.c
@@ -124,7 +124,10 @@ int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking
} /* TODO: be pedantic and remove utm_source and other tracking bullshit */
return 1;
}
-struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct sc_query * q, SC_OPT_TYPE opt) { /* check4cachedB4 */
+enum sc_return sc_query_google (const char * s, /* breaking change: changed return type */
+ struct sc_cache * c,
+ struct sc_query * q,
+ SC_OPT_TYPE opt) { /* check4cachedB4 */
/* query is in most cases NULL. then it will be allocated and put into sc_cache. otherwise response will be put into passed q. */
/* if query is not NULL, it MUST be initialized */
/*
@@ -144,7 +147,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
* I couldn't find anything with ratings, so we won't parse thouse either yet
* captcha: google knows that this nokia phone we're pretending to be doesn't support javascript, but does not care, and loads an obfuscated captcha anyways that would be hard to defeat for now without some kind of chromium emulation we really don't want.
*/
- int rs = 1;
+ enum sc_return rs = 1;
char * xpath = NULL;
char * descclass = NULL;
char * titleclass = NULL;
@@ -152,7 +155,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
htmlDocPtr xmldoc = NULL;
char * txtdoc = NULL;
if (!s || !c) {
- rs = -1;
+ rs = SC_BADCALL;
goto rc;
}
int qwasgiven = 0;
@@ -168,15 +171,18 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
free(us);
if (!txtdoc) {
SC_LOG(SC_LOG_ERROR, c, "!txtdoc");
- rs = -2;
+ rs = SC_EMPTYRESPONSE;
+ goto rc;
+ }
+ if (!strstr(txtdoc, "In the meantime, solving the above CAPTCHA will let you continue")) {
+ rs = SC_CAPTCHA;
goto rc;
}
- /* TODO: check if response is asking for a captcha */
if (opt & SC_OPT_IMAGE) {
imageclass = sc_find_class(txtdoc, "{font-family:Roboto,Helvetica,Arial,sans-serif}");
if (!imageclass) {
SC_LOG(SC_LOG_ERROR, c, "!imageclass, txtdoc = %s", txtdoc);
- rs = -3;
+ rs = SC_NOIMGCLASS;
goto rc;
}
} else {
@@ -184,7 +190,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
descclass = sc_find_class(txtdoc, "{word-break:break-word}");
if (!titleclass || !descclass) {
SC_LOG(SC_LOG_ERROR, c, "!titleclass || !descclass, txtdoc = %s", txtdoc);
- rs = -4;
+ rs = SC_NOCLASS;
goto rc;
}
}
@@ -207,7 +213,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
char * hreflink = (char *) xmlGetProp(node, BAD_CAST "href"); /* xmlGetProp copies and allocates */
if (!hreflink) {
SC_LOG(SC_LOG_ERROR, c, "!hreflink");
- rs = -5;
+ rs = SC_NOHREF;
return;
}
if (opt & SC_OPT_IMAGE) {
@@ -218,7 +224,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s
xmlFree(hreflink);
if (!imgurl && !imgrefurl) {
SC_LOG(SC_LOG_ERROR, c, "!imgurl && !imgrefurl, txtdoc = %s", txtdoc);
- /* rs = -6; */ /* we continue running not fail because of a single picture */
+ /* rs = -7; */ /* we continue running not fail because of a single picture */
free(imgurl);
free(imgrefurl);
return; /* check! */
@@ -313,5 +319,5 @@ rc:
free(descclass);
free(imageclass);
free(xpath);
- return (rs < 0) ? NULL : q;
+ return rs;
}
diff --git a/src/hp.php b/src/hp.php
index 900b539..cffa469 100644
--- a/src/hp.php
+++ b/src/hp.php
@@ -1,7 +1,7 @@
<!DOCTYPE html>
<html lang=sl>
<!-- this file is a printf format. be sure to escape percent signs with percent percent. -->
- <!-- this format requires the following types (in order): query string, query string, additional form elements, result info string, results html string -->
+ <!-- this format requires the following types (in order): html title (query string), search box contents (query string), additional form elements, textual title (result info string), textual body (results html string) -->
<head>
<meta charset=UTF-8 />
<title>
diff --git a/src/httpd.c b/src/httpd.c
index 39604ef..23f4b60 100644
--- a/src/httpd.c
+++ b/src/httpd.c
@@ -129,6 +129,7 @@ enum MHD_Result sc_httpd (void * cls,
char add_form[128];
const char * l = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "l");
const char * h = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "h");
+ const char * f = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "f");
snprintf(add_form, 128, "%s%s%d%s", h ? "<input type=hidden name=h value=h />" : "",
l ? "<input type=hidden name=l value=" : "<!-- Odgovor na dokončno vprašanje o Življenju, Vesolju in sploh Vsem je ",
l ? atoi(l) : 42,
@@ -187,24 +188,61 @@ retry:
q = i_am_retarded ? *i_am_retarded : NULL;
#endif
if (q) {
- const char * l = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "l");
- if (MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "f") && q->results_length > 0) {
- mhdrmm = MHD_RESPMEM_PERSISTENT; /* no need to generate HTML if */
- content_type = "text/plain"; /* we have a feeling of luck! */
+ if (f && q->results_length > 0) {
+ mhdrmm = MHD_RESPMEM_PERSISTENT;/* no need to gen HTML */
+ content_type = "text/plain"; /* we have a feeling of luck! */
response = SC_I18N_HORSESHOE_RESPONSE;
status_code = 307;
- location = q->results[0]->url ? q->results[0]->url : SC_I18N_NO_HREFLINK;
+#define LOCHORSE (q->results[0]->url ? q->results[0]->url : SC_I18N_NO_HREFLINK)
+ char * out = alloca(strlen(LOCHORSE)+1);
+ strcpy(out, LOCHORSE);
+ if (h)
+ sc_https2http(LOCHORSE);
+ location = out;
} else
response = sc_queryhtml(q, add_form, atoi(l ? l : "0")); /* MHD_create_response_from_buffer will free response (; */
SC_CUE(c, c->queries_lock);
} else {
SC_CUE(c, c->queries_lock);
- sc_query_google(query, c, NULL, opt);
- if (already_retried++) {
- char * safequery = htmlspecialchars(query);
- response = malloc(strlen((char*) sc_hp)+strlen(safequery)*2+strlen(SC_I18N_HP_ERROR_HEADING)+strlen(SC_I18N_HP_ERROR_BODY)+strlen(add_form));
- sprintf(response, (char *) sc_hp, safequery, safequery, add_form, SC_I18N_HP_ERROR_HEADING, SC_I18N_HP_ERROR_BODY);
- free(safequery);
+ enum sc_return r = sc_query_google(query, c, NULL, opt);
+ if (already_retried++ || r == SC_CAPTCHA) {
+ status_code = 570+ABS(r);
+ if (r == SC_CAPTCHA && strlen(query) < 4096) {
+ if (getenv("SC_FALLBACK")) {
+ status_code = 307;
+ location = alloca(strlen(getenv("SC_FALLBACK"))
+ + 256 + strlen(query)*3);
+ sprintf(location, "%sl=%d&q=", getenv("SC_FALLBACK"),
+ atoi(l ? l : ""));
+ urlencode(location+strlen(location), query);
+ if (opt & SC_OPT_IMAGE)
+ strcat(location, "&i=i");
+ if (h)
+ strcat(location, "&h=h");
+ if (f)
+ strcat(location, "&f=f");
+ }
+ char * safequery = htmlspecialchars(query);
+ response = malloc(strlen((char*) sc_hp)
+ + strlen(safequery) * 2
+ + strlen(SC_I18N_HP_CAPTCHA_HEADING)
+ + strlen(SC_I18N_HP_CAPTCHA_BODY)
+ + strlen(add_form));
+ sprintf(response, (char *) sc_hp, safequery,
+ safequery, add_form, SC_I18N_HP_CAPTCHA_HEADING,
+ SC_I18N_HP_CAPTCHA_BODY);
+ free(safequery);
+ } else {
+ char * safequery = htmlspecialchars(query);
+ response = malloc(strlen((char*) sc_hp) + strlen(safequery)*2
+ + strlen(SC_I18N_HP_ERROR_HEADING)
+ + strlen(SC_I18N_HP_ERROR_BODY)
+ + strlen(add_form));
+ sprintf(response, (char *) sc_hp, safequery, safequery,
+ add_form, SC_I18N_HP_ERROR_HEADING,
+ SC_I18N_HP_ERROR_BODY);
+ free(safequery);
+ }
} else goto retry;
}
}
diff --git a/src/i18n.h b/src/i18n.h
index 5d7e3be..0982e1a 100644
--- a/src/i18n.h
+++ b/src/i18n.h
@@ -20,6 +20,13 @@
#define SC_I18N_HP_ERROR_HEADING "napaka!"
#define SC_I18N_HP_ERROR_BODY "Pridobivanje rezultatov ni uspelo. Mogoče ni rezultatov. " \
"Preberite sistemske dnevnike."
+#define SC_I18N_HP_CAPTCHA_HEADING "zavrnitev zahteve!"
+#define SC_I18N_HP_CAPTCHA_BODY "zahteva za rezultate je bila zavrnjena, ker je zgornjetokovni " \
+ "strežnik zaznal strojno proženje iskanj. Prikazan je bil moderen test CAPTCHA, ki ga ni " \
+ "moč preprosto posredovati uporabniku. Ponavadi v takih primerih čez nekaj minut blokada " \
+ "poteče in z iskanjem lahko nadaljujete. Če pa se to sporočilo ponavlja, pa lahko " \
+ "nastavite okoljsko spremenljivko <code>SC_FALLBACK</code> na naslov strežnika, na " \
+ "katerega naj so ob takih napakah preusmerjena iskanja, dokler niso le-ta spet omogočena."
#define SC_I18N_LOGS "dnevniški zapisi"
#define SC_I18N_LOGS_ERROR "napaka pri branju dnevnikov"
#define SC_I18N_LOGS_NOT_ENABLED "Zbiranje dnevniških zapisov v delovni pomnilnik ni omogočeno. <code>sear.c</code> prevedite z <code>make -e CC=\"cc -DSC_LOGMEM\"</code>; z nastavitvijo zastavice <code>SC_LOGMEM</code> omogočite pregled dnevniških zapisov znotraj aplikacije. Vselej pa se vsi dnevniški zapisi pišejo tudi na standardni izhod, kar se v primeru uporabe <code>sear.c</code> kot <code>systemd</code> storitve shranjuje v sistemske dnevnike."
diff --git a/src/main.c b/src/main.c
index 32c7e96..f57e398 100644
--- a/src/main.c
+++ b/src/main.c
@@ -26,6 +26,32 @@
#endif
#include <lib.c>
#include <url.c>
+#define ABS(x) (((x) < 0) ? -(x) : (x)) /* absolute value of x; the argument is expanded twice, so do not pass expressions with side effects */
+enum sc_return { /* failure codes returned by sc_query_google; every enumerator is negative, and sc_httpd derives its error status as 570+ABS(code) */
+ SC_BADCALL = -1, /* calling the function the wrong way, I could actually just crash */
+ SC_EMPTYRESPONSE = -2, /* something weird, do not retry */
+ SC_CAPTCHA = -3, /* engine responded with a CAPTCHA, redirect to a different server */
+ SC_NOIMGCLASS = -4, /* couldn't find image class name from definition, do not retry */
+ SC_NOCLASS = -5, /* couldn't find result class name from def, do not retry */
+ SC_NOHREF = -6 /* a result did not contain a href attribute */
+};
+const char * sc_return_str (enum sc_return r) { /* maps r to its enumerator name; the result is a string literal (never NULL, must not be freed or modified) */
+ switch (r) {
+ case SC_BADCALL:
+ return "SC_BADCALL";
+ case SC_EMPTYRESPONSE:
+ return "SC_EMPTYRESPONSE";
+ case SC_CAPTCHA:
+ return "SC_CAPTCHA";
+ case SC_NOIMGCLASS:
+ return "SC_NOIMGCLASS";
+ case SC_NOCLASS:
+ return "SC_NOCLASS";
+ case SC_NOHREF:
+ return "SC_NOHREF";
+ }
+ return "SC_BADRETURN"; /* reached when r equals none of the enumerators handled above */
+}
unsigned char sc_hp[] = { /* html page null terminated format string, from file src/hp.html */
#include <hp.xxd>
};