{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "7e32a042",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "......\n",
      "fixed bad single file torrent 4f269d8aefd647ee270842d53ec98aebd23a4afe\n",
      "fixed bad single file torrent 7b09ae0b612dafc1744562dccbbe4becf4d633c3\n",
      "47843 @ 78.78622500100755 s\n"
     ]
    }
   ],
   "source": [
    "# Load the torrents found under /var/opt/travnik. Every later cell reads the `torrents`\n",
    "# mapping (sha1 -> torrent object) and the `monotonic` timer imported here.\n",
    "from os.path import expanduser, join\n",
    "from sys import path\n",
    "from time import monotonic\n",
    "\n",
    "# travnik is imported from a checkout in the home directory rather than an installed package.\n",
    "# expanduser(\"~\") still resolves when $HOME is unset, where getenv(\"HOME\") + \"...\" raised TypeError.\n",
    "path.append(join(expanduser(\"~\"), \"projects\", \"travnik\"))\n",
    "from travnik import glob\n",
    "\n",
    "print(\"......\")\n",
    "start = monotonic()\n",
    "torrents = glob(\"/var/opt/travnik\")\n",
    "print(len(torrents), \"@\", monotonic()-start, \"s\")\n",
    "# Example of inspecting a single torrent file by hand:\n",
    "# t = Torrent()\n",
    "# t.file(\"/root/projects/travnik/449a38ef7e042bd2d75e8921aa02f6f244165d9d.torrent\")\n",
    "# print(t.sha1.hex())\n",
    "# for path, length in t.paths():\n",
    "#     print(path, length)\n",
    "# print(t)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "978ab1cf",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "...\n"
     ]
    }
   ],
   "source": [
    "# Print both hashes of every hybrid torrent whose info dictionary has no (or a falsy)\n",
    "# b'meta version' entry.\n",
    "from travnik import Type\n",
    "\n",
    "# the loop variable is named sha1 (not hash) so the builtin hash() is not shadowed\n",
    "for sha1, torrent in torrents.items():\n",
    "    if torrent.type == Type.HYBRID and not torrent.dict.get(b'info').get(b'meta version'):\n",
    "        print(torrent.sha1.hex(), torrent.sha256.hex())\n",
    "print(\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4419e5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Walk the torrents in order of their b'creation date' and add up (a) the time between\n",
    "# consecutive torrents, ignoring gaps of ten minutes or more, and (b) the number and total\n",
    "# size of the files they describe.\n",
    "s = monotonic()\n",
    "prej = None  # creation date of the previously visited torrent\n",
    "skup = 0     # summed gaps shorter than ten minutes, in the unit of b'creation date'\n",
    "dat = 0      # number of files\n",
    "vel = 0      # summed file sizes\n",
    "for torrent in sorted(torrents.values(), key=lambda x: x.dict.get(b'creation date')):\n",
    "    č = torrent.dict.get(b'creation date')\n",
    "    # a single pass over paths() feeds both totals\n",
    "    for path, size in torrent.paths():\n",
    "        dat += 1\n",
    "        vel += size\n",
    "    # `is None` rather than truthiness, so a creation date of 0 is not mistaken for \"unset\"\n",
    "    if prej is None:\n",
    "        prej = č\n",
    "        continue\n",
    "    if prej + 60*10 > č:\n",
    "        skup += č-prej\n",
    "    prej = č\n",
    "# avoid ZeroDivisionError when no torrents were loaded\n",
    "povprecje = skup/len(torrents) if len(torrents) else 0\n",
    "print(monotonic()-s, \"torrenti so se zbirali\", skup/86400, \"dni. en torrent je bil najden v povprečju na\", povprecje, \"sekund, v\", len(torrents), \"so metapodatki\", dat, \"datotek\", \"v skupni velikosti\", vel/(1024**4), \"TiB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e170de45",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Bar chart of torrent counts per client. The client is read from source.v of each torrent\n",
    "# (presumably the version string of the peer the metadata came from - TODO confirm in travnik).\n",
    "from math import log\n",
    "from matplotlib import pyplot\n",
    "%matplotlib notebook\n",
    "\n",
    "s = monotonic()\n",
    "def uas(normalize=True, minrepr=0):\n",
    "    \"\"\"Count torrents per client string.\n",
    "\n",
    "    normalize -- cut the string at the first b'/', else at the first b' (', else at the\n",
    "                 first space, replace the micro sign with a Greek mu and decode it to str\n",
    "    minrepr   -- clients seen in fewer torrents than this are summed under \"ostali\"\n",
    "\n",
    "    Returns ([(count, client), ...], number of distinct clients before grouping).\n",
    "    The client is None for torrents that carry no such value.\n",
    "    \"\"\"\n",
    "    odjemalci = {}\n",
    "    for sha1, torrent in torrents.items():\n",
    "        # a torrent without a b'source' dictionary is counted as an unknown (None) client\n",
    "        # instead of raising AttributeError on None\n",
    "        odjemalec = (torrent.dict.get(b'source') or {}).get(b'v')\n",
    "        if normalize and odjemalec is not None:\n",
    "            if b'/' in odjemalec:\n",
    "                odjemalec = odjemalec.split(b'/')[0]\n",
    "            elif b' (' in odjemalec:\n",
    "                odjemalec = odjemalec.split(b' (')[0]\n",
    "            else:\n",
    "                odjemalec = odjemalec.split(b' ')[0]\n",
    "            # UTF-8 MICRO SIGN -> UTF-8 GREEK SMALL LETTER MU, so both spellings share one bar\n",
    "            odjemalec = odjemalec.replace(b'\\xc2\\xb5', b'\\xce\\xbc').decode()\n",
    "        odjemalci[odjemalec] = odjemalci.get(odjemalec, 0) + 1\n",
    "    trueodj = {\"ostali\": 0}\n",
    "    for key, value in odjemalci.items():\n",
    "        if value < minrepr:\n",
    "            trueodj[\"ostali\"] += value\n",
    "        else:\n",
    "            trueodj[key] = value\n",
    "    return [(v, k) for k, v in trueodj.items()], len(odjemalci)\n",
    "\n",
    "odjemalci, count = uas(True, minrepr=0.01*len(torrents))\n",
    "# Sort by count, then by name. Comparing the raw tuples raised TypeError whenever the None\n",
    "# client tied with a named one, because None and str cannot be ordered.\n",
    "odjemalci = sorted(odjemalci, key=lambda par: (par[0], str(par[1])))\n",
    "fig, axes = pyplot.subplots()\n",
    "# axes.pie([log(sights) if sights else 0 for sights, name in odjemalci], labels=[name for sights, name in odjemalci])\n",
    "axes.barh([name if name is not None else \"neznan\" for sights, name in odjemalci], [sights for sights, name in odjemalci])\n",
    "axes.set_title(\"log skala odjemalcev\")\n",
    "axes.set_xscale(\"log\")\n",
    "fig.show()\n",
    "print(monotonic()-s, \"za\", count, \"različnih odjemalcev\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52de34d6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# How often each key occurs in the info dictionaries. For every key the first value seen and\n",
    "# the sha1 of the torrent it came from are kept as an example.\n",
    "# pyplot is imported here so the cell does not depend on an earlier plotting cell having run\n",
    "from matplotlib import pyplot\n",
    "%matplotlib notebook\n",
    "\n",
    "s = monotonic()\n",
    "keys = {}  # key name -> [occurrences, example value, sha1 hex of the example torrent]\n",
    "for sha1, torrent in torrents.items():\n",
    "    info = torrent.dict.get(b'info')\n",
    "    for key in info.keys():\n",
    "        name = key.decode()\n",
    "        if name in keys:\n",
    "            keys[name][0] += 1\n",
    "            continue\n",
    "        value = info.get(key)\n",
    "        if type(value) is bytes:\n",
    "            try:\n",
    "                value = value.decode()\n",
    "            except UnicodeDecodeError:\n",
    "                pass  # not valid UTF-8: keep the raw bytes as the example\n",
    "        keys[name] = [1, value, sha1.hex()]\n",
    "sort = sorted(keys, key=lambda x: keys[x][0])\n",
    "print(monotonic()-s, \"s\", len(keys))\n",
    "fig, ax = pyplot.subplots()\n",
    "ax.barh(sort, [keys[x][0] for x in sort])\n",
    "ax.set_xscale(\"log\")\n",
    "ax.set_xlabel(\"število pojavitev ključa v slovarju info\")\n",
    "fig.show()  # TODO: add commentary on this figure\n",
    "for i in sort:\n",
    "    print(i, keys[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fea0f2b6",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Torrent counts per distributor, taken from the first of the info keys source, publisher,\n",
    "# publisher-url and comment that is present.\n",
    "# pyplot is imported here so the cell does not depend on an earlier plotting cell having run\n",
    "from IPython.display import display\n",
    "from matplotlib import pyplot\n",
    "from tabulate import tabulate\n",
    "%matplotlib notebook\n",
    "\n",
    "s = monotonic()\n",
    "def removeminorities(population, minrepr=0, ostalo=\"ostalo\"):\n",
    "    \"\"\"Return a copy of `population` with small entries merged into one bucket.\n",
    "\n",
    "    population -- mapping of label -> count\n",
    "    minrepr    -- entries whose count is below this value are merged\n",
    "    ostalo     -- label of the bucket; it is always present, with 0 when nothing was merged\n",
    "    \"\"\"\n",
    "    true = {ostalo: 0}\n",
    "    for key, value in population.items():\n",
    "        if value < minrepr:\n",
    "            true[ostalo] += value\n",
    "        else:\n",
    "            true[key] = value\n",
    "    return true\n",
    "\n",
    "def sources():\n",
    "    \"\"\"Count torrents per distributor label.\n",
    "\n",
    "    The label is the first value that is not None among the info keys b'source',\n",
    "    b'publisher', b'publisher-url' and b'comment'. Bytes are decoded and stripped, or kept as\n",
    "    bytes when they are not valid UTF-8. Torrents with none of the keys are counted under None.\n",
    "    \"\"\"\n",
    "    counts = {}\n",
    "    for sha1, torrent in torrents.items():\n",
    "        info = torrent.dict.get(b'info')\n",
    "        source = None\n",
    "        for key in (b'source', b'publisher', b'publisher-url', b'comment'):\n",
    "            source = info.get(key)\n",
    "            if source is not None:\n",
    "                break\n",
    "        if type(source) is bytes:\n",
    "            try:\n",
    "                source = source.decode().strip()\n",
    "            except UnicodeDecodeError:\n",
    "                pass  # keep the raw bytes as the label\n",
    "        counts[source] = counts.get(source, 0) + 1\n",
    "    return counts\n",
    "\n",
    "# NOTE: from here on the name `sources` refers to the result dictionary, not the function\n",
    "sources = sources()\n",
    "# a threshold of 0 merges nothing; raise the factor to fold rare distributors into \"ostali\"\n",
    "sources = removeminorities(sources, len(sources)*0, \"ostali\")\n",
    "sort = sorted(sources, reverse=True, key=lambda x:sources[x])\n",
    "# Torrents without any of the keys are reported as a percentage below and left out of the\n",
    "# chart. The None bucket is absent when every torrent names a distributor, so both the\n",
    "# removal and the lookup must tolerate that.\n",
    "if None in sort:\n",
    "    sort.remove(None)\n",
    "print(monotonic()-s, \"s\", sources.get(None, 0)/len(torrents)*100, \"brez ključa source, publisher, publisher-url ali comment\", len(sources), \"virov\")\n",
    "fig, ax = pyplot.subplots()\n",
    "ax.barh([str(x) for x in sort], [sources[x] for x in sort])\n",
    "ax.set_xscale(\"log\")\n",
    "ax.set_xlabel(\"število pojavitev distributerja\")\n",
    "fig.show()  # TODO: add commentary on this figure\n",
    "# the table was built but never shown, because it was not the last expression of the cell\n",
    "display(tabulate([[x, sources[x]] for x in sort], tablefmt=\"html\"))\n",
    "for x in sort:\n",
    "    print(sources[x], \"\\t\", x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4bd1f517",
   "metadata": {},
   "outputs": [],
   "source": [
    "# File type statistics: per type the number of files, the summed size, and the number of\n",
    "# torrents in which that type takes up the most space.\n",
    "# Needs removeminorities() from the distributor cell above.\n",
    "from mimetypes import guess_type\n",
    "\n",
    "s = monotonic()\n",
    "def ext(mime=False, minreprratio=0):\n",
    "    \"\"\"Classify every file of every torrent by type.\n",
    "\n",
    "    mime         -- True: the type is the MIME type guessed from the file name (None when it\n",
    "                    cannot be guessed); False: the lowercased text after the last dot of the\n",
    "                    file name (the whole name when it contains no dot)\n",
    "    minreprratio -- types below this share of all files / all bytes / all torrents are merged\n",
    "                    under \"ostale\"\n",
    "\n",
    "    Returns (files per type, bytes per type, torrents per dominant type,\n",
    "    number of distinct types, total files, total bytes).\n",
    "    Raises IndexError for a torrent that lists no files.\n",
    "    \"\"\"\n",
    "    bycount = {}\n",
    "    bysize = {}\n",
    "    bysizerepresentative = {}\n",
    "    filescount = 0\n",
    "    bytescount = 0\n",
    "    for sha1, torrent in torrents.items():\n",
    "        representatives = {}  # bytes per type within this torrent\n",
    "        try:\n",
    "            for path, size in torrent.paths():\n",
    "                filescount += 1\n",
    "                bytescount += size\n",
    "                # path[-1] rather than path.pop(): popping would shorten the list handed out\n",
    "                # by paths(), which corrupts later runs if that list belongs to the torrent\n",
    "                name = path[-1]\n",
    "                if mime:\n",
    "                    kind = guess_type(name.decode(encoding=\"iso-8859-2\"))[0]\n",
    "                else:\n",
    "                    kind = name.split(b'.').pop().decode(encoding=\"iso-8859-2\").lower()\n",
    "                bycount[kind] = bycount.get(kind, 0) + 1\n",
    "                bysize[kind] = bysize.get(kind, 0) + size\n",
    "                representatives[kind] = representatives.get(kind, 0) + size\n",
    "        except AttributeError:\n",
    "            # name the offending torrent, then re-raise with the original traceback\n",
    "            print(sha1.hex(), torrent)\n",
    "            raise\n",
    "        try:\n",
    "            # the type with the most bytes; among equals the one seen last\n",
    "            representative = sorted(representatives, key=lambda x: representatives[x]).pop()\n",
    "        except IndexError:\n",
    "            # the torrent listed no files\n",
    "            print(sha1.hex(), torrent)\n",
    "            raise\n",
    "        bysizerepresentative[representative] = bysizerepresentative.get(representative, 0) + 1\n",
    "    truebycount = removeminorities(bycount, minreprratio*filescount, \"ostale\")\n",
    "    truebysize = removeminorities(bysize, minreprratio*bytescount, \"ostale\")\n",
    "    truebysizerepresentative = removeminorities(bysizerepresentative, minreprratio*len(torrents), \"ostale\")\n",
    "    return truebycount, truebysize, truebysizerepresentative, len(bycount), filescount, bytescount\n",
    "\n",
    "print(\"...\")\n",
    "bycount, bysize, bysizerepresentative, kinds, filescount, bytescount = ext(False, 0.001)\n",
    "print(monotonic()-s, \"s\", kinds, \"različnih tipov v\", filescount, \"datotekah in\", bytescount/(1024**4), \"TiB\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "82ab922a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# One log-scale bar chart per statistic computed by ext() in the previous cell.\n",
    "from matplotlib import pyplot\n",
    "%matplotlib notebook\n",
    "\n",
    "sortcount = sorted(bycount, key=lambda x: bycount[x])\n",
    "sortsize = sorted(bysize, key=lambda x: bysize[x])\n",
    "sortsizerepresentative = sorted(bysizerepresentative, key=lambda x: bysizerepresentative[x])\n",
    "for desc, (order, values) in {\"po številu datotek\": (sortcount, bycount), \"po velikosti datotek\": (sortsize, bysize), \"po številu po velikosti največjih datotek torrentov\": (sortsizerepresentative, bysizerepresentative)}.items():\n",
    "    fig, axes = pyplot.subplots()\n",
    "    # axes.pie([values[key] for key in order], labels=order)\n",
    "    # a None key (unknown MIME type from ext(mime=True)) cannot be a bar label, so it is\n",
    "    # drawn as \"neznan\"\n",
    "    axes.barh([\"neznan\" if key is None else key for key in order], [values[key] for key in order])\n",
    "    axes.set_xscale(\"log\")\n",
    "    axes.set_title(desc)\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fca757e3",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}