From a10a8fb335e5a817e1a9add49ee179394eea67c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Anton=20Luka=20=C5=A0ijanec?= Date: Sun, 26 Dec 2021 19:52:31 +0100 Subject: fixed parser, fixed leak, O(log n) storage - tsearch(3) - 0.0.17 --- .gitignore | 1 + README.md | 6 ++-- src/api.c | 18 ++++++++---- src/httpd.c | 24 ++++++++++++---- src/i18n.h | 10 ++++++- src/log.c | 4 +-- src/main.c | 5 +++- src/structs.c | 91 +++++++++++++++++++++++++++++++++++++++++++++------------- test/tsearch.c | 15 ++++++++++ 9 files changed, 135 insertions(+), 39 deletions(-) create mode 100644 test/tsearch.c diff --git a/.gitignore b/.gitignore index 5e9d484..7620845 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ sear.c tmp/ valgrind-out.txt core +a.out diff --git a/README.md b/README.md index 8c96e3f..adac2e9 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ service sear.c start ## requirements * a POSIX system -* GNU C library +* GNU C library (uses `tdestroy(3)` if compiled without `SC_OLD_STORAGE`) * GNU compiler collection (it's written in GNU C - it uses nested functions) * GNU Make * libxml2-dev (for the simple HTML/1.0 client and HTML parser) @@ -32,12 +32,12 @@ make ## instructions -* run the daemon - it starts listening on HTTP port 7327 (remember it by picturing phone keyboard buttons with letters SEAR (; ) +* run the daemon - it starts listening on HTTP port 7327 (remember it by picturing phone keyboard buttons with letters SEAR (; ) - port can be set with the environment variable `SC_PORT` * optional: create a reverse proxy for HTTPS * navigate to [http://localhost:7327](http://localhost:7327) and do a couple of searches to see if everything works * the horseshoe button redirects directly to the first result without wasting time on the results page. use if you feel lucky. (BP) * the painting button performs a search for images. 
PRIVACY WARNING: images are loaded directly from servers (not from google) -* program writes all logs to standard error +* program writes all logs to standard error (and to `/logs.html` if compiled with `SC_LOGMEM`) * setting the h parameter will rewrite links to HTTP from HTTPS * setting the l parameter with a number will limit number of displayed links to that number. diff --git a/src/api.c b/src/api.c index 5ab4083..a366882 100644 --- a/src/api.c +++ b/src/api.c @@ -87,7 +87,7 @@ htmlDocPtr sc_capix (struct sc_cache * c, char * body, char * headers, int isfmt char * sc_find_class (char * haystack, const char * definition) { /* you must free class after calling */ if (!haystack || !definition) return NULL; - char * class = strstr(haystack, definition); + char * class = strcasestr(haystack, definition); if (!class) return NULL; int found = 0; @@ -117,7 +117,7 @@ int sc_fix_url (char ** h) { /* fixes a (result) URL in-place (removes tracking urldecode(*h, *h); } char * c = NULL; - if ((c = strstr(*h, "googleweblight.com/fp?u="))) { /* stage 2: url may be "light web" tracking url by google results */ + if ((c = strcasestr(*h, "googleweblight.com/fp?u="))) { /* stage 2: url may be "light web" tracking url by google results */ *h = c+strlen("googleweblight.com/fp?u="); /* we could disable this with a cookie but meh, this is easier and _stateless_ */ *strchrnul(*h, '&') = '\0'; urldecode(*h, *h); @@ -156,6 +156,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s goto rc; } int qwasgiven = 0; + int sl = strlen(s); if (!q) q = sc_query_init(); else @@ -214,6 +215,7 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s char * imgrefurl = NULL; /* easy, huh? 
*/ SC_LOG(SC_LOG_DEBUG, c, "hreflink = %s", hreflink); sscanf(hreflink, "/imgres?imgurl=%m[^&]&imgrefurl=%m[^&]", &imgurl, &imgrefurl); + xmlFree(hreflink); if (!imgurl && !imgrefurl) { SC_LOG(SC_LOG_ERROR, c, "!imgurl && !imgrefurl, txtdoc = %s", txtdoc); /* rs = -6; */ /* we continue running not fail because of a single picture */ @@ -257,8 +259,9 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s if (hreflink) { SC_GTR->url = malloc(strlen(hreflink)+1); strcpy(SC_GTR->url, hreflink); - xmlFree(orig_hreflink_for_free); } else SC_GTR->url = NULL; + if (orig_hreflink_for_free) + xmlFree(orig_hreflink_for_free); cp = (char *) xmlNodeGetContent(descnode); if (cp) { SC_GTR->desc = malloc(strlen(cp)+1); @@ -285,17 +288,20 @@ struct sc_query * sc_query_google (const char * s, struct sc_cache * c, struct s } q->cache = c; q->lookup_time = time(NULL); - q->engines = SC_ENGINE_GOOGLE; - q->string = realloc(q->string, strlen(s)+1); - q->opt = opt; + q->string = realloc(q->string, sl+1); + q->opt |= opt | SC_ENGINE_GOOGLE; strcpy(q->string, s); if (!qwasgiven) { SC_CWLE(c, c->queries_lock); +#ifdef SC_OLD_STORAGE if (c->queries_sizeof <= c->queries_length) SC_BIGGER_ARRAY(c->queries, sc_query, 0); c->queries_length++; #define SC_GTQ c->queries[c->queries_length-1] SC_GTQ = q; +#else /* we don't detect here if query is already stored, but it should not be ... 
*/ + tsearch(q, &c->qrp, SC_COMPAR_CAST sc_query_compar); +#endif } SC_CUE(c, c->queries_lock); rc: diff --git a/src/httpd.c b/src/httpd.c index 514f57a..39604ef 100644 --- a/src/httpd.c +++ b/src/httpd.c @@ -3,7 +3,7 @@ char * sc_https2http (char * i) { memmove(i+4, i+5, strlen(i)-3); return i; } -char * sc_queryhtml (struct sc_query * q, const char * add_form, size_t l) { /* remember to free returned string in the caller */ /* caller takes care of freeing */ +char * sc_queryhtml (const struct sc_query * q, const char * add_form, size_t l) { /* remember to free returned string in the caller */ /* caller takes care of freeing */ size_t resultshtml_written = 0; size_t resultshtml_sizeof = SC_ALLOC_CHUNK; char * resultshtml = malloc(resultshtml_sizeof); @@ -120,12 +120,12 @@ enum MHD_Result sc_httpd (void * cls, char * location = "//git.sijanec.eu/sijanec/sear.c"; char * content_type = "text/html"; int status_code = MHD_HTTP_OK; - SC_OPT_TYPE opt = 0; + SC_OPT_TYPE opt = SC_OPT_INIT; if (MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "i")) opt |= SC_OPT_IMAGE; if (!host) host = ""; - struct sc_query * q = NULL; + const struct sc_query * q = NULL; char add_form[128]; const char * l = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "l"); const char * h = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "h"); @@ -172,18 +172,30 @@ enum MHD_Result sc_httpd (void * cls, } } else { int already_retried = 0; + const struct sc_query query_to_find = { + .string = (char *) query, + .opt = opt + }; retry: SC_CRLE(c, c->queries_lock); +#ifdef SC_OLD_STORAGE for (size_t i = 0; i < c->queries_length; i++) - if (!strcmp(c->queries[i]->string, query) && c->queries[i]->opt == opt) + if (!sc_query_compar(c->queries[i], &query_to_find)) q = c->queries[i]; +#else /* tfind(3) also requires a pointer to the variable that holds rootp! 
*/ + const struct sc_query ** i_am_retarded = tfind(&query_to_find, &c->qrp, SC_COMPAR_CAST sc_query_compar); + q = i_am_retarded ? *i_am_retarded : NULL; +#endif if (q) { const char * l = MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "l"); - response = sc_queryhtml(q, add_form, atoi(l ? l : "0")); /* MHD_create_response_from_buffer will free response (; */ if (MHD_lookup_connection_value(connection, MHD_GET_ARGUMENT_KIND, "f") && q->results_length > 0) { + mhdrmm = MHD_RESPMEM_PERSISTENT; /* no need to generate HTML if */ + content_type = "text/plain"; /* we have a feeling of luck! */ + response = SC_I18N_HORSESHOE_RESPONSE; status_code = 307; location = q->results[0]->url ? q->results[0]->url : SC_I18N_NO_HREFLINK; - } + } else + response = sc_queryhtml(q, add_form, atoi(l ? l : "0")); /* MHD_create_response_from_buffer will free response (; */ SC_CUE(c, c->queries_lock); } else { SC_CUE(c, c->queries_lock); diff --git a/src/i18n.h b/src/i18n.h index 8e97c33..5d7e3be 100644 --- a/src/i18n.h +++ b/src/i18n.h @@ -2,8 +2,15 @@ #define SC_I18N_NO_HREFLINK "/? ni hiperpovezave" #define SC_I18N_NO_DESCRIPTION "ni opisa" #define SC_I18N_HP_HEADING "dobrodošli na prvo stran sear.c" +#ifdef SC_OLD_STORAGE +#define SC_I18N_STORAGE "preprost O(n) iskalnik po seznamu. " +#else +#define SC_I18N_STORAGE "napreden POSIX tsearch(3) iskalnik po binarnem drevesu. " +#endif #define SC_I18N_HP_BODY "sear.c je program za anonimizacijo in predpomnenje rezultatov spletnih iskalnikov. " \ - "Za uporabo nekaj vnesite v iskalno vrstico zgoraj in pritisnite gumb za iskanje." + "Za uporabo nekaj vnesite v iskalno vrstico zgoraj in pritisnite gumb za iskanje. " \ + "
Uporabljen algoritem za predpomnjenje rezultatov je " SC_I18N_STORAGE \ + "Na izbiro algoritma med prevodom vplivate z zastavico SC_OLD_STORAGE." #define SC_I18N_NUMBER_OF_RESULTS "število zadetkov" #define SC_I18N_QUERY_TIME "čas poizvedbe" #define SC_I18N_DATETIME_FORMAT "%c" @@ -17,3 +24,4 @@ #define SC_I18N_LOGS_ERROR "napaka pri branju dnevnikov" #define SC_I18N_LOGS_NOT_ENABLED "Zbiranje dnevniških zapisov v delovni pomnilnik ni omogočeno. sear.c prevedite z make -e CC=\"cc -DSC_LOGMEM\"; z nastavitvijo zastavice SC_LOGMEM omogočite pregled dnevniških zapisov znotraj aplikacije. Vselej pa se vsi dnevniški zapisi pišejo tudi na standardni izhod, kar se v primeru uporabe sear.c kot systemd storitve shranjuje v sistemske dnevnike." #define SC_I18N_GIT_URL "//git.sijanec.eu/sijanec/sear.c" +#define SC_I18N_HORSESHOE_RESPONSE "Če vidite to besedilo, vaš brskalnik ne podpira preusmeritev. V tem primeru ne uporabljajte argumenta f." diff --git a/src/log.c b/src/log.c index e7ee4f8..4c46804 100644 --- a/src/log.c +++ b/src/log.c @@ -1,5 +1,5 @@ -const char * sc_log_str (int t) { - switch (t) { +const char * sc_log_str (SC_OPT_TYPE t) { + switch (t & SC_LOG_MASK) { case SC_LOG_ERROR: return "SC_LOG_ERROR"; case SC_LOG_WARNING: diff --git a/src/main.c b/src/main.c index 6576fb8..32c7e96 100644 --- a/src/main.c +++ b/src/main.c @@ -21,6 +21,9 @@ #include #include #include +#ifndef SC_OLD_STORAGE +#include <search.h> +#endif #include #include unsigned char sc_hp[] = { /* html page null terminated format string, from file src/hp.html */ @@ -39,7 +42,7 @@ char sc_securitytxt[] = "# This content information is provided by the developer "Encryption: https://www.sijanec.eu/pgp-key.txt\n" "Expires: Thu, 31 Dec 2021 18:37:07 -0800\n" "Preferred-Languages: sl, en, de, hr\n"; -#define SC_HTTP_PORT 7327 /* SEAR on mobile keyboard */ +#define SC_HTTP_PORT (getenv("SC_PORT") ? 
atoi(getenv("SC_PORT")) : 7327) /* SEAR on mobile keyboard */ #define SC_HTTP_RBUFSIZE 4096 /* initial size of http read buffer, increasning by K */ #define SC_HTTP_USER_AGENT "Nokia WAP Gateway 4.1 CD1/ECD13_D/4.1.04)" /* so google and others sends a minimal response */ #define SC_HTTP_HEADERS "User-Agent: " SC_HTTP_USER_AGENT "\r\n" diff --git a/src/structs.c b/src/structs.c index 83d19b9..650eb88 100644 --- a/src/structs.c +++ b/src/structs.c @@ -4,22 +4,28 @@ #define SC_CRLE(c, name) (pthread_rwlock_rdlock(name) ? (SC_LOG(SC_LOG_ERROR,c,SC_I18N_LOCKING " " #name " " SC_I18N_FAILED)||1) :0) #define SC_CUE(c, name) (pthread_rwlock_unlock(name) ? (SC_LOG(SC_LOG_ERROR,c,SC_I18N_UNLOCKING " " #name " " SC_I18N_FAILED)||1):0) #define SC_REALLOC_K 1.5 /* constant to dynamically realloc large arrays (new size = current size * K) */ -#define SC_ENGINE_GOOGLE (1 << 0) /* _Atomic(size_t) sc_mem_max = 100e6; */ /* the really soft memory limit of the program: 100MB - NOT IMPLEMENTED */ #define SC_LOG(t, c, m, ...) 
sc_push_log(t, c, __func__, __FILE__, __LINE__, 0##__VA_OPT__(1), m __VA_OPT__(,) __VA_ARGS__) -#define SC_LOG_ERROR (1 << 0) -#define SC_LOG_WARNING (1 << 1) -#define SC_LOG_INFO (1 << 2) -#define SC_LOG_DEBUG (1 << 3) #define SC_BIGGER_ARRAY(name, type, shallinit) do { \ name = realloc(name, sizeof(name[0])*ceil(name##_sizeof*SC_REALLOC_K)); \ for (size_t i = name##_sizeof; shallinit && (i < ceil(name##_sizeof*SC_REALLOC_K)); i++) \ name[i] = type##_init(); \ name##_sizeof = ceil(name##_sizeof*SC_REALLOC_K); /* ceil je ZELO pomemben, če je chunk 1 recimo */ \ } while (0); -#define SC_OPT_TYPE unsigned char -#define SC_OPT_IMAGE (1 << 0) #define SC_STR(x) #x +enum sc_opt { + SC_ENGINE_GOOGLE = 1 << 0, + SC_LOG_ERROR = 1 << 1, + SC_LOG_WARNING = 1 << 2, + SC_LOG_INFO = 1 << 3, + SC_LOG_DEBUG = 1 << 4, + SC_OPT_IMAGE = 1 << 5 +}; +#define SC_LOG_MASK (SC_LOG_ERROR | SC_LOG_WARNING | SC_LOG_INFO | SC_LOG_DEBUG) +#define SC_OPT_TYPE enum sc_opt +#define SC_OPT_INIT 0 +#define SC_OPT_COMPAR /* mask */ (/* SC_ENGINE_GOOGLE | */ /* any engine is okay */ SC_OPT_IMAGE) +#define SC_COMPAR_CAST (int (*)(const void *, const void *)) #ifdef SC_LOGMEM struct sc_logentry { unsigned char type; /* SC_LOG_ERROR, SC_LOG_WARNING, SC_LOG_INFO, SC_LOG_DEBUG */ @@ -32,6 +38,19 @@ struct sc_logentry { int sc_logentry_free (struct sc_logentry * l); /* defined in log.c */ struct sc_logentry * sc_logentry_init (); /* defined in log.c */ #endif +struct sc_cache { +#ifdef SC_OLD_STORAGE + SC_IN_STRUCT_ARRAY(struct sc_query, queries); /* yesfree */ +#else + void * qrp; /* queries root pointer-tsearch(3) */ +#endif + pthread_rwlock_t * queries_lock; +#ifdef SC_LOGMEM + SC_IN_STRUCT_ARRAY(struct sc_logentry, logentries); /* yesfree */ + pthread_rwlock_t * logentries_lock; +#endif +}; +int sc_push_log (unsigned char t, struct sc_cache * c, const char * ca, char * f, size_t l, unsigned short int isf, char * m, ...); struct sc_result { struct sc_query * query; /* nofree - free from sc_cache */ char 
* url; /* yesfree - url of referer page when image searching */ @@ -65,8 +84,7 @@ struct sc_query { SC_IN_STRUCT_ARRAY(struct sc_result, results); /* yesfree */ char * string; /* yesfree - query string, stripped of any excess characters that should be excluded from indexing */ time_t lookup_time; /* time of last lookup */ - unsigned char engines; /* with what engine(s) was the query done - bitmask - if there are results from multiple engines */ - SC_OPT_TYPE opt; /* some options */ + SC_OPT_TYPE opt; /* some options including engines */ }; struct sc_query * sc_query_init () { struct sc_query * q = calloc(1, sizeof(struct sc_query)); @@ -77,31 +95,60 @@ struct sc_query * sc_query_init () { q->results[i]->query = q; } q->string = NULL; + q->opt = SC_OPT_INIT; return q; } -int sc_query_free (struct sc_query * q) { +#ifdef SC_OLD_STORAGE +int +#else +void +#endif +sc_query_free ( +#ifdef SC_OLD_STORAGE + struct sc_query +#else + void +#endif + * i) { + struct sc_query * q = +#ifndef SC_OLD_STORAGE + (struct sc_query *) +#endif + i; if (!q) - return -1; + return +#ifdef SC_OLD_STORAGE + -1 +#endif + ; + if (q->cache) + SC_LOG(SC_LOG_DEBUG, q->cache, "sc_query_free: %s", q->string ? q->string : "NULL"); free(q->string); /* if they were not alloced, they are NULL, if they were free'd somewhere else, they are also set to NULL */ for (size_t i = 0; i < q->results_sizeof; i++) sc_result_free(q->results[i]); free(q->results); free(q); - return 1; -} -struct sc_cache { - SC_IN_STRUCT_ARRAY(struct sc_query, queries); /* yesfree */ - pthread_rwlock_t * queries_lock; -#ifdef SC_LOGMEM - SC_IN_STRUCT_ARRAY(struct sc_logentry, logentries); /* yesfree */ - pthread_rwlock_t * logentries_lock; + return +#ifdef SC_OLD_STORAGE + 1 #endif -}; + ; +} +int sc_query_compar (const struct sc_query * a, const struct sc_query * b) { +#define SC_QUERY_COMPAR_OPT ->opt & SC_OPT_COMPAR + int r = (a SC_QUERY_COMPAR_OPT) < (b SC_QUERY_COMPAR_OPT) ? 
-1 : (a SC_QUERY_COMPAR_OPT) > (b SC_QUERY_COMPAR_OPT) ? 1 : 0; + if (r) /* first we compare options, because it should be faster than query strings */ + return r; /* yeah, I know, useless optimizations */ + return strcmp(a->string, b->string); +} struct sc_cache * sc_cache_init() { #define SC_CILI(name) do { name##_lock = malloc(sizeof(pthread_rwlock_t)); pthread_rwlock_init(name##_lock, NULL); } while (0) struct sc_cache * c = calloc(1, sizeof(struct sc_cache)); +#ifdef SC_OLD_STORAGE c->queries_sizeof = SC_ALLOC_CHUNK; c->queries = calloc(c->queries_sizeof, sizeof(struct sc_query *)); +#endif #ifdef SC_LOGMEM c->logentries_sizeof = SC_ALLOC_CHUNK; c->logentries = calloc(c->logentries_sizeof, sizeof(struct sc_logentry *)); @@ -116,10 +163,14 @@ int sc_cache_free(struct sc_cache * c) { #define SC_CFLD(name) do { pthread_rwlock_destroy(name##_lock); free(name##_lock); } while(0) if (!c) return -1; +#ifdef SC_OLD_STORAGE fprintf(stderr, "c->queries_sizeof = %zu\n", c->queries_sizeof); for (size_t i = 0; i < c->queries_sizeof; i++) sc_query_free(c->queries[i]); free(c->queries); +#else + tdestroy(c->qrp, sc_query_free); +#endif #ifdef SC_LOGMEM for (size_t i = 0; i < c->logentries_sizeof; i++) sc_logentry_free(c->logentries[i]); diff --git a/test/tsearch.c b/test/tsearch.c new file mode 100644 index 0000000..45c4eef --- /dev/null +++ b/test/tsearch.c @@ -0,0 +1,15 @@ +#include <search.h> +#include <stdio.h> +#include <string.h> +#define CAST (int (*) (const void *, const void *)) +int compar (const char * a, const char * b) { + return strcmp(a, b); +} +int main () { + void * root = NULL; + tsearch("key1", &root, CAST compar); + tsearch("key1", &root, CAST compar); + tsearch("key2", &root, CAST compar); + fprintf(stdout, "this should say key1: %s\n", *(char **) tfind("key1", &root, CAST compar)); + return 0; +} -- cgit v1.2.3