summaryrefslogblamecommitdiffstats
path: root/analiza/zvezek.ipynb
blob: bf864bd33aaebf20f28da6753ce91ac354de4cea (plain) (tree)
1
2
3
4
5
6
7
8



                       
                        

                    
                     






                             

                                                                                 
                                     



              
                                   
                             

                                                              
                                 

                            
                                              










                                                                                              




                        




                             
             



              













                                                                                                   



















                                                                                                                                                                                                                                                             
                           



                     
                 

                        
                                            














                                                                                         









                                                           
                                                                 
                                                     


                                      




                                                                                                                                    

                                                                    



                       
                           
                    
                

                    
                 




























                                                                      
                           


                    
     
                 

                        




                                                                        
                      

                                      







                                                                             

                                                                       










                                                         
                                                                        

                                                                      
                                                                                                                                                             





                                                                    
                                      


                                                                     



                       
                           
                    
                  
                 

                        















































                                                                                                                      
                                                                                                 
                                                                                                                             



                       
                           

                    
                 













                                                                                                                                                                                                                                            







                           

















                                          
                       




                    
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7e32a042",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "......\n",
      "fixed bad single file torrent 4f269d8aefd647ee270842d53ec98aebd23a4afe\n",
      "fixed bad single file torrent 7b09ae0b612dafc1744562dccbbe4becf4d633c3\n",
      "47843 @ 78.78622500100755 s\n"
     ]
    }
   ],
   "source": [
    "from time import monotonic\n",
    "from sys import path\n",
    "from os import getenv\n",
    "from os.path import expanduser\n",
    "# expanduser(\"~\") replaces getenv(\"HOME\") + ...: getenv returns None when HOME is\n",
    "# unset, which made the string concatenation raise TypeError.\n",
    "travnik_dir = expanduser(\"~\") + \"/projects/travnik\"\n",
    "# Re-running this cell must not append the same directory a second time.\n",
    "if travnik_dir not in path:\n",
    "    path.append(travnik_dir)\n",
    "from travnik import glob\n",
    "print(\"......\")\n",
    "start = monotonic()\n",
    "# Later cells call .items() and len() on `torrents` and read .dict, .paths(),\n",
    "# .type, .sha1 and .sha256 from its values.\n",
    "torrents = glob(\"/var/opt/travnik\")\n",
    "print(len(torrents), \"@\", monotonic()-start, \"s\")\n",
    "# t = Torrent()\n",
    "# t.file(\"/root/projects/travnik/449a38ef7e042bd2d75e8921aa02f6f244165d9d.torrent\")\n",
    "# print(t.sha1.hex())\n",
    "# for path, length in t.paths():\n",
    "#     print(path, length)\n",
    "# print(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "978ab1cf",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "...\n"
     ]
    }
   ],
   "source": [
    "from travnik import Type\n",
    "# Print the SHA-1 and SHA-256 of every hybrid torrent whose info dict has no\n",
    "# (or a falsy) b'meta version' entry.\n",
    "# Iterating over .values() avoids binding a loop variable called `hash`, which\n",
    "# shadowed the built-in hash() for every cell executed afterwards.\n",
    "for torrent in torrents.values():\n",
    "    if torrent.type == Type.HYBRID and not torrent.dict.get(b'info').get(b'meta version'):\n",
    "        print(torrent.sha1.hex(), torrent.sha256.hex())\n",
    "print(\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4419e5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = monotonic()\n",
    "dat = 0  # number of files described by the collected metadata\n",
    "vel = 0  # sum of the file sizes reported by paths()\n",
    "datumi = []  # creation dates of the torrents that carry one\n",
    "for torrent in torrents.values():\n",
    "    # One pass over paths() per torrent instead of two. The loop variable is not\n",
    "    # called `path` so that the sys.path list imported in the first cell survives.\n",
    "    for pot, size in torrent.paths():\n",
    "        dat += 1\n",
    "        vel += size\n",
    "    č = torrent.dict.get(b'creation date')\n",
    "    # A missing creation date (None) cannot be sorted against ints, so such\n",
    "    # torrents are left out of the timing estimate; their files are still counted.\n",
    "    if č is not None:\n",
    "        datumi.append(č)\n",
    "datumi.sort()\n",
    "prej = None  # creation date of the previous torrent in chronological order\n",
    "skup = 0  # collecting time in seconds; gaps of 10 minutes or more are not counted\n",
    "for č in datumi:\n",
    "    if prej is not None and prej + 60*10 > č:\n",
    "        skup += č-prej\n",
    "    prej = č\n",
    "# Avoid ZeroDivisionError when no torrents were loaded.\n",
    "povprečje = skup/len(torrents) if len(torrents) else 0\n",
    "print(monotonic()-s, \"torrenti so se zbirali\", skup/86400, \"dni. en torrent je bil najden v povprečju na\", povprečje, \"sekund, v\", len(torrents), \"so metapodatki\", dat, \"datotek\", \"v skupni velikosti\", vel/(1024**4), \"TiB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e170de45",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "s = monotonic()\n",
    "def uas(normalize=True, minrepr=0):\n",
    "    \"\"\"Count torrents per client string found in torrent.dict[b'source'][b'v'].\n",
    "\n",
    "    normalize: when true, the client string is cut at the first b'/', otherwise at\n",
    "        the first b' (', otherwise at the first space; the UTF-8 micro sign is\n",
    "        replaced by the Greek letter mu and the result is decoded to str.\n",
    "    minrepr: clients seen fewer than this many times are summed under \"ostali\".\n",
    "\n",
    "    Returns (pairs, count): pairs is a list of (occurrences, client) tuples whose\n",
    "    first element is the \"ostali\" bucket; client is None for torrents without a\n",
    "    client string. count is the number of distinct clients before merging.\n",
    "    \"\"\"\n",
    "    odjemalci = {}\n",
    "    for torrent in torrents.values():\n",
    "        # A torrent without the b'source' dict is counted under None instead of\n",
    "        # raising AttributeError on None.get.\n",
    "        odjemalec = (torrent.dict.get(b'source') or {}).get(b'v')\n",
    "        if normalize and odjemalec is not None:\n",
    "            if b'/' in odjemalec:\n",
    "                odjemalec = odjemalec.split(b'/')[0]\n",
    "            elif b' (' in odjemalec:\n",
    "                odjemalec = odjemalec.split(b' (')[0]\n",
    "            else:\n",
    "                odjemalec = odjemalec.split(b' ')[0]\n",
    "            # errors=\"replace\": one undecodable client string must not abort the\n",
    "            # whole statistic.\n",
    "            odjemalec = odjemalec.replace(b'\\xc2\\xb5', b'\\xce\\xbc').decode(errors=\"replace\")\n",
    "        odjemalci[odjemalec] = odjemalci.get(odjemalec, 0) + 1\n",
    "    count = len(odjemalci)\n",
    "    trueodj = {\"ostali\": 0}\n",
    "    for key, value in odjemalci.items():\n",
    "        if value < minrepr:\n",
    "            trueodj[\"ostali\"] += value\n",
    "        else:\n",
    "            # Add instead of assign so that a client literally named \"ostali\"\n",
    "            # cannot wipe out the values already merged into the bucket.\n",
    "            trueodj[key] = trueodj.get(key, 0) + value\n",
    "    return [(v, k) for k, v in trueodj.items()], count\n",
    "odjemalci, count = uas(True, minrepr=0.01*len(torrents))\n",
    "# uas() returns (occurrences, name) pairs and name is None for torrents without a\n",
    "# client string. Sorting the bare tuples compares None with str whenever two\n",
    "# counts tie, which raises TypeError. The explicit key keeps the count-then-name\n",
    "# order and places None first among equal counts.\n",
    "odjemalci = sorted(odjemalci, key=lambda par: (par[0], par[1] is not None, par[1] or \"\"))\n",
    "from matplotlib import pyplot\n",
    "%matplotlib notebook\n",
    "fig, axes = pyplot.subplots()\n",
    "from math import log\n",
    "# axes.pie([log(sights) if sights else 0 for sights, name in odjemalci], labels=[name for sights, name in odjemalci])\n",
    "axes.barh([name if name is not None else \"neznan\" for sights, name in odjemalci], [sights for sights, name in odjemalci])\n",
    "axes.set_title(\"log skala odjemalcev\")\n",
    "pyplot.xscale(\"log\")\n",
    "fig.show()\n",
    "print(monotonic()-s, \"za\", count, \"različnih odjemalcev\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52de34d6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "s = monotonic()\n",
    "# keys maps each decoded info-dict key to\n",
    "# [number of torrents using it, first value seen, SHA-1 hex of that first torrent].\n",
    "keys = {}\n",
    "for sha1, torrent in torrents.items():\n",
    "    info = torrent.dict.get(b'info')\n",
    "    for surov in info.keys():\n",
    "        ime = surov.decode()\n",
    "        zapis = keys.get(ime)\n",
    "        if zapis is not None:\n",
    "            zapis[0] += 1\n",
    "            continue\n",
    "        vrednost = info.get(surov)\n",
    "        if type(vrednost) is bytes:\n",
    "            # Keep the raw bytes when the sample value is not valid UTF-8.\n",
    "            try:\n",
    "                vrednost = vrednost.decode()\n",
    "            except UnicodeDecodeError:\n",
    "                pass\n",
    "        keys[ime] = [1, vrednost, sha1.hex()]\n",
    "# Key names ordered from the rarest to the most common.\n",
    "sort = sorted(keys, key=lambda ime: keys[ime][0])\n",
    "print(monotonic()-s, \"s\", len(keys))\n",
    "%matplotlib notebook\n",
    "fig, ax = pyplot.subplots()\n",
    "pojavitve = [keys[ime][0] for ime in sort]\n",
    "ax.barh(sort, pojavitve)\n",
    "pyplot.xscale(\"log\")\n",
    "pyplot.xlabel(\"število pojavitev ključa v slovarju info\")\n",
    "fig.show()  # TODO: add commentary\n",
    "for ime in sort:\n",
    "    print(ime, keys[ime])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fea0f2b6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "s = monotonic()\n",
    "def removeminorities(population, minrepr=0, ostalo=\"ostalo\"):\n",
    "    true = {ostalo: 0}\n",
    "    for key, value in population.items():\n",
    "        if value < minrepr:\n",
    "            true[ostalo] += value\n",
    "        else:\n",
    "            true[key] = value\n",
    "    return true\n",
    "def sources():\n",
    "    \"\"\"Count torrents per distributor label.\n",
    "\n",
    "    The label of a torrent is the first non-None value among the info-dict keys\n",
    "    b'source', b'publisher', b'publisher-url' and b'comment'. A bytes label is\n",
    "    decoded and stripped; when decoding fails the raw bytes are kept. Torrents\n",
    "    with none of the four keys are counted under None.\n",
    "\n",
    "    Returns a dict mapping label -> number of torrents.\n",
    "    \"\"\"\n",
    "    stetje = {}\n",
    "    for torrent in torrents.values():\n",
    "        info = torrent.dict.get(b'info')\n",
    "        oznaka = None\n",
    "        for kljuc in (b'source', b'publisher', b'publisher-url', b'comment'):\n",
    "            oznaka = info.get(kljuc)\n",
    "            if oznaka is not None:\n",
    "                break\n",
    "        if type(oznaka) is bytes:\n",
    "            try:\n",
    "                oznaka = oznaka.decode().strip()\n",
    "            except UnicodeDecodeError:\n",
    "                pass\n",
    "        stetje[oznaka] = stetje.get(oznaka, 0) + 1\n",
    "    return stetje\n",
    "sources = sources()\n",
    "# A threshold of 0 merges nothing; the call only adds the (empty) \"ostali\" bucket.\n",
    "sources = removeminorities(sources, len(sources)*0, \"ostali\")\n",
    "# Torrents without any label are counted under None. Reading it with .get() and\n",
    "# filtering it out while sorting keeps the cell working when every torrent has a\n",
    "# label (sort.remove(None) and sources[None] raised in that case).\n",
    "brez = sources.get(None, 0)\n",
    "sort = sorted((x for x in sources if x is not None), reverse=True, key=lambda x:sources[x])\n",
    "print(monotonic()-s, \"s\", brez/len(torrents)*100, \"brez ključa source, publisher, publisher-url ali comment\", len(sources), \"virov\")\n",
    "%matplotlib notebook\n",
    "fig, ax = pyplot.subplots();\n",
    "ax.barh([str(x) for x in sort], [sources[x] for x in sort])\n",
    "pyplot.xscale(\"log\")\n",
    "pyplot.xlabel(\"število pojavitev distributerja\")\n",
    "fig.show()  # TODO: add commentary\n",
    "for x in sort:\n",
    "    print(sources[x], \"\\t\", x)\n",
    "from tabulate import tabulate\n",
    "# Kept as the last expression of the cell: in the middle of the cell the returned\n",
    "# table was evaluated and thrown away, so it was never displayed.\n",
    "tabulate([[x, sources[x]] for x in sort], tablefmt=\"html\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bd1f517",
   "metadata": {},
   "outputs": [],
   "source": [
    "s = monotonic()\n",
    "from mimetypes import guess_type\n",
    "def ext(mime=False, minreprratio=0):\n",
    "    \"\"\"Collect file-type statistics over all loaded torrents.\n",
    "\n",
    "    mime: when true, a file is classified by guess_type() of its name (None when\n",
    "        the type cannot be guessed); otherwise by the lower-cased text after the\n",
    "        last b'.' of its name.\n",
    "    minreprratio: types below this fraction of the respective total (files,\n",
    "        bytes, torrents) are merged into \"ostale\" by removeminorities().\n",
    "\n",
    "    Returns (bycount, bysize, bysizerepresentative, kinds, filescount, bytescount):\n",
    "        bycount: type -> number of files,\n",
    "        bysize: type -> summed size,\n",
    "        bysizerepresentative: type -> number of torrents in which that type has\n",
    "            the largest summed size,\n",
    "        kinds: number of distinct types before merging,\n",
    "        filescount / bytescount: totals over all torrents.\n",
    "\n",
    "    Raises AttributeError or IndexError (after printing the offending torrent)\n",
    "    when a torrent cannot be processed, e.g. when it lists no files.\n",
    "    \"\"\"\n",
    "    bycount = {}\n",
    "    bysize = {}\n",
    "    bysizerepresentative = {}\n",
    "    filescount = 0\n",
    "    bytescount = 0\n",
    "    for sha1, torrent in torrents.items():\n",
    "        representatives = {}\n",
    "        try:\n",
    "            for path, size in torrent.paths():\n",
    "                filescount += 1\n",
    "                bytescount += size\n",
    "                # path[-1] instead of path.pop(): popping removed the file name\n",
    "                # from the list yielded by paths(). NOTE(review): if paths() hands\n",
    "                # out the torrent's own lists, that corrupted them for later cells\n",
    "                # and re-runs - confirm in travnik.\n",
    "                name = path[-1]\n",
    "                if mime:\n",
    "                    kind = guess_type(name.decode(encoding=\"iso-8859-2\"))[0]\n",
    "                else:\n",
    "                    # NOTE(review): a name without a dot yields the whole name.\n",
    "                    kind = name.split(b'.')[-1].decode(encoding=\"iso-8859-2\").lower()\n",
    "                bycount[kind] = bycount.get(kind, 0) + 1\n",
    "                bysize[kind] = bysize.get(kind, 0) + size\n",
    "                representatives[kind] = representatives.get(kind, 0) + size\n",
    "        except AttributeError:\n",
    "            print(sha1.hex(), torrent)\n",
    "            # Bare raise keeps the original message and traceback.\n",
    "            raise\n",
    "        try:\n",
    "            # Type with the largest summed size in this torrent; the last one wins\n",
    "            # on ties because sorted() is stable.\n",
    "            representative = sorted(representatives, key=lambda x: representatives[x])[-1]\n",
    "        except IndexError:\n",
    "            print(sha1.hex(), torrent)\n",
    "            raise\n",
    "        bysizerepresentative[representative] = bysizerepresentative.get(representative, 0) + 1\n",
    "    truebycount = removeminorities(bycount, minreprratio*filescount, \"ostale\")\n",
    "    truebysize = removeminorities(bysize, minreprratio*bytescount, \"ostale\")\n",
    "    truebysizerepresentative = removeminorities(bysizerepresentative, minreprratio*len(torrents), \"ostale\")\n",
    "    # The former loop that rebuilt each dict as a list of pairs only rebound its\n",
    "    # loop variable and had no effect; callers index the returned dicts by key.\n",
    "    return truebycount, truebysize, truebysizerepresentative, len(bycount), filescount, bytescount\n",
    "print(\"...\")\n",
    "# Classify by file extension (mime=False) with a minimum representation ratio of 0.001.\n",
    "statistika = ext(mime=False, minreprratio=0.001)\n",
    "bycount, bysize, bysizerepresentative, kinds, filescount, bytescount = statistika\n",
    "tib = bytescount/(1024**4)\n",
    "print(monotonic()-s, \"s\", kinds, \"različnih tipov v\", filescount, \"datotekah in\", tib, \"TiB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82ab922a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from matplotlib import pyplot\n",
    "%matplotlib notebook\n",
    "# Type names in ascending order of their value, one ordering per statistic.\n",
    "sortcount = sorted(bycount, key=bycount.get)\n",
    "sortsize = sorted(bysize, key=bysize.get)\n",
    "sortsizerepresentative = sorted(bysizerepresentative, key=bysizerepresentative.get)\n",
    "grafi = [\n",
    "    (\"po številu datotek\", sortcount, bycount),\n",
    "    (\"po velikosti datotek\", sortsize, bysize),\n",
    "    (\"po številu po velikosti največjih datotek torrentov\", sortsizerepresentative, bysizerepresentative),\n",
    "]\n",
    "# One horizontal bar chart with a logarithmic x axis per statistic.\n",
    "for desc, vrstni_red, vrednosti in grafi:\n",
    "    fig, axes = pyplot.subplots()\n",
    "    stolpci = [vrednosti[tip] for tip in vrstni_red]\n",
    "    axes.barh(vrstni_red, stolpci)\n",
    "    pyplot.xscale(\"log\")\n",
    "    axes.set_title(desc)\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fca757e3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}