Source code for k1lib.cli.modifier

# AUTOGENERATED FILE! PLEASE DON'T EDIT
"""
This is for quick modifiers, think of them as changing formats
"""
__all__ = ["applyS", "aS", "apply", "map_", "applyMp", "parallel", "applyCl",
           "applyTh", "applySerial",
           "sort", "sortF", "consume", "randomize", "stagger", "op",
           "integrate"]
from typing import Callable, Iterator, Any, Union, List, Tuple
from k1lib.cli.init import patchDefaultDelim, BaseCli, T, fastF
import k1lib.cli as cli, numpy as np, threading, gc; import k1lib
from collections import deque
from functools import partial, update_wrapper, lru_cache
from k1lib.cli.typehint import *
import dill, pickle, k1lib, warnings, atexit, signal, time, os, random
try: import torch; import torch.multiprocessing as mp; hasTorch = True
except: import multiprocessing as mp; hasTorch = False
ray = k1lib.dep("ray")
settings = k1lib.settings.cli
[docs]class applyS(BaseCli):
[docs] def __init__(self, f:Callable[[T], T], *args, **kwargs): """Like :class:`apply`, but much simpler, just operating on the entire input object, essentially. The "S" stands for "single". There's also an alias shorthand for this called :class:`aS`. Example:: # returns 5 3 | aS(lambda x: x+2) Like :class:`apply`, you can also use this as a decorator like this:: @aS def f(x): return x+2 # returns 5 3 | f This also decorates the returned object so that it has same qualname, docstring and whatnot. .. admonition:: Shorthands Writing out "lambda x:" all the time is annoying, and there are ways to quickly say ``lambda x: x+2`` like so:: 3 | op()+2 # returns 5 3 | aS("x+2") # returns 5. Behind the scenes, it compiles and execute `lambda x: x+2` The first way is to use :class:`op`, that will absorb all operations done on it, like "+", and returns a function that essentially replays all the operations. In the second way, you only have to pass in the string containing code that you want done on the variable "x". Then internally, it will compile to regular Python code. In fact, you can pass in ``op()`` or just a string to any cli that accepts any kind of function, like :class:`~k1lib.cli.filt.filt` or :class:`apply`:: range(4) | apply("x-2") | deref() range(4) | apply(op()-2) | deref() range(4) | filt("x%2") | deref() range(4) | filt(op()%2) | deref() :param f: the function to be executed :param kwargs: other keyword arguments to pass to the function, together with ``args``""" super().__init__(fs=[f]); self.args = args; self.kwargs = kwargs self.f = f; self._fC = fastF(f); update_wrapper(self, f, updated=())
def _typehint(self, inp): if self.hasHint: return self._hint try: return self.f._typehint(inp) except: return tAny()
[docs] def __ror__(self, it:T) -> T: return self._fC(it, *self.args, **self.kwargs)
[docs] def __invert__(self): """Configures it so that it expand the arguments out. Example:: # returns 5 [2, 3] | ~aS(lambda x, y: x + y) def f(x, y, a=4): return x*y + a # returns 10 [2, 3] | ~aS(f) # returns 11 [2, 3] | ~aS(f, a=5)""" f = self.f; a = self.args; kw = self.kwargs; return applyS(lambda x: f(*x, *a, **kw));
aS = applyS
[docs]class apply(BaseCli):
[docs] def __init__(self, f:Callable[[T], T], column:Union[int, List[int]]=None, cache:int=0, **kwargs): """Applies a function f to every element in the incoming list/iterator. Example:: # returns [0, 1, 4, 9, 16] range(5) | apply(lambda x: x**2) | deref() # returns [[3.0, 1.0, 1.0], [3.0, 1.0, 1.0]], running the function on the 0th column torch.ones(2, 3) | apply(lambda x: x+2, 0) | deref() # returns [[0, -1, 2, 3, -4], [2, -3, 4, 5, -6], [0, -1, 4, 9, -16]], running the function on the 1st (0-index btw) and 4th columns [[0, 1, 2, 3, 4], [2, 3, 4, 5, 6], [0, 1, 4, 9, 16]] | apply(lambda x: -x, [1, 4]) | deref() You can also use this as a decorator, like this:: @apply def f(x): return x**2 # returns [0, 1, 4, 9, 16] range(5) | f | deref() You can also add a cache, like this:: def calc(i): time.sleep(0.5); return i**2 # takes 2.5s range(5) | repeatFrom(2) | apply(calc, cache=10) | deref() # takes 5s range(5) | repeatFrom(2) | apply(calc) | deref() You can add custom keyword arguments into the function:: def f(x, y, z=3): return x + y + z # returns [15, 17, 19, 21, 23] [range(5), range(10, 15)] | transpose() | ~apply(f, z=5) | deref() If "apply" is too hard to remember, this cli also has an alias :class:`map_` that kinda mimics Python's ``map()``. Also slight reminder that you can't pass in extra positional args like in :class:`aS`, just extra keyword arguments. :param column: if not None, then applies the function to that column or columns only :param cache: if specified, then caches this much number of values :param kwargs: extra keyword arguments to pass in the function""" super().__init__(fs=[f]); self.f = f; self.kwargs = kwargs if column: ex = Exception(f"Applying a function on a negative-indexed column ({column}) is not supported") if isinstance(column, int): if column < 0: raise ex else: column = list(column) if len([c for c in column if c < 0]): raise ex self.column = column; self.cache = cache; self._fC = fastF(f) if cache > 0: self._fC = lru_cache(cache)(self._fC) self.normal = self.column is None and self.cache == 0 # cached value to say that this apply is just being used as a wrapper, nothing out of the ordinary
def _typehint(self, inp): if self.column is None: if isinstance(inp, tListIterSet): try: return tIter(self.f._typehint(inp.child)) except: return tIter(tAny()) return super()._typehint(inp) def _copy(self): return apply(self.f, self.column, self.cache, **self.kwargs) # ~apply() case handled automatically
[docs] def __ror__(self, it:Iterator[str]): c = self.column; f = self._fC; kwargs = self.kwargs if c is None: return (f(line, **kwargs) for line in it) else: if isinstance(c, int): return ([(e if i != c else f(e, **kwargs)) for i, e in enumerate(row)] for row in it) else: ops = [] for c_ in c: a = self._copy(); a.column = c_; ops.append(a) return it | cli.serial(*ops)
[docs] def __invert__(self): """Same mechanism as in :class:`applyS`, it expands the arguments out. Just for convenience really. Example:: # returns [10, 12, 14, 16, 18] [range(5), range(10, 15)] | transpose() | ~apply(lambda x, y: x+y) | deref()""" return apply(lambda x: self.f(*x, **self.kwargs), self.column, self.cache)
map_ = apply def executeFunc(common, line): import dill, time; f, kwargs = dill.loads(common) res = f(dill.loads(line), **kwargs) time.sleep(0.1); return res # suggestion by https://stackoverflow.com/questions/36359528/broken-pipe-error-with-multiprocessing-queue def terminateGraceful(): signal.signal(signal.SIGINT, signal.SIG_IGN) _k1_applyMp_global_ctx = {}; _k1_applyMp_global_ctx_autoInc = k1lib.AutoIncrement(prefix="_k1_applyMp")
[docs]class applyMp(BaseCli): _pools = set() _torchNumThreads = None
[docs] def __init__(self, f:Callable[[T], T], prefetch:int=None, timeout:float=8, utilization:float=0.8, bs:int=1, newPoolEvery:int=0, **kwargs): """Like :class:`apply`, but execute a function over the input iterator in multiple processes. Example:: # returns [3, 2] ["abc", "de"] | applyMp(len) | deref() # returns [5, 6, 9] range(3) | applyMp(lambda x, bias: x**2+bias, bias=5) | deref() # returns [[1, 2, 3], [1, 2, 3]], demonstrating outside vars work someList = [1, 2, 3] ["abc", "de"] | applyMp(lambda s: someList) | deref() Internally, this will continuously spawn new jobs up until 80% of all CPU cores are utilized. On posix systems, the default multiprocessing start method is ``fork()``. This sort of means that all the variables in memory will be copied over. On windows and macos, the default start method is ``spawn``, meaning each child process is a completely new interpreter, so you have to pass in all required variables and reimport every dependencies. Read more at https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods If you don't wish to schedule all jobs at once, you can specify a ``prefetch`` amount, and it will only schedule that much jobs ahead of time. Example:: range(10000) | applyMp(lambda x: x**2) | head() | deref() # 700ms range(10000) | applyMp(lambda x: x**2, 5) | head() | deref() # 300ms # demonstrating there're no huge penalties even if we want all results at the same time range(10000) | applyMp(lambda x: x**2) | deref() # 900ms range(10000) | applyMp(lambda x: x**2, 5) | deref() # 1000ms The first line will schedule all jobs at once, and thus will require more RAM and compute power, even though we discard most of the results anyway (the :class:`~k1lib.cli.filt.head` cli). The second line only schedules 5 jobs ahead of time, and thus will be extremely more efficient if you don't need all results right away. .. note:: Remember that every :class:`~k1lib.cli.init.BaseCli` is also a function, meaning that you can do stuff like:: # returns [['ab', 'ac']] [["ab", "cd", "ac"]] | applyMp(filt(op().startswith("a")) | deref()) | deref() Also remember that the return result of ``f`` should be serializable, meaning it should not be a generator. That's why in the example above, there's a ``deref()`` inside f. You should also convert PyTorch tensors into Numpy arrays Most of the time, you would probably want to specify ``bs`` to something bigger than 1 (may be 32 or sth like that). This will executes ``f`` multiple times in a single job, instead of executing ``f`` only once per job. Should reduce overhead of process creation dramatically. If you encounter strange errors not seen on :class:`apply`, you can try to clear all pools (using :meth:`clearPools`), to terminate all child processes and thus free resources. On earlier versions, you have to do this manually before exiting, but now :class:`applyMp` is much more robust. Also, you should not immediately assume that :class:`applyMp` will always be faster than :class:`apply`. Remember that :class:`applyMp` will create new processes, serialize and transfer data to them, execute it, then transfer data back. If your code transfers a lot of data back and forth (compared to the amount of computation done), or the child processes don't have a lot of stuff to do before returning, it may very well be a lot slower than :class:`apply`. There's a potential loophole here that can make your code faster. Because the main process is forked (at least on linux), every variable is still there, even the big ones. So, you can potentially do something like this:: bigData = [] # 1B items in the list # summing up all items together. No input data transfers (because it's forked instead) range(1_000_000_000) | batched(100) | applyMp(lambda r: r | apply(lambda i: bigData[i]) | toSum()) | toSum() In fact, I use this loophole all the time, and thus has made the function :meth:`shared`, so check it out. :param prefetch: if not specified, schedules all jobs at the same time. If specified, schedules jobs so that there'll only be a specified amount of jobs, and will only schedule more if results are actually being used. :param timeout: seconds to wait for job before raising an error :param utilization: how many percent cores are we running? 0 for no cores, 1 for all the cores. Defaulted to 0.8 :param bs: if specified, groups ``bs`` number of transforms into 1 job to be more efficient. :param kwargs: extra arguments to be passed to the function. ``args`` not included as there're a couple of options you can pass for this cli. :param newPoolEvery: creates a new processing pool for every specific amount of input fed. 0 for not refreshing any pools at all. Turn this on in case your process consumes lots of memory and you have to kill them eventually to free up some memory""" super().__init__(fs=[f]); self.f = fastF(f) self.prefetch = prefetch or int(1e9) self.timeout = timeout; self.utilization = utilization self.bs = bs; self.kwargs = kwargs; self.p = None self.newPoolEvery = newPoolEvery; self.ps = []; self._serializeF = True
[docs] def __ror__(self, it:Iterator[T]) -> Iterator[T]: timeout = self.timeout; it = iter(it); f = self.f # really make sure it's an iterator, for prefetch if self.bs > 1: return it | cli.batched(self.bs, True) | applyMp(apply(f) | cli.toList(), self.prefetch, timeout, **self.kwargs) | cli.joinStreams() def newPool(): if hasTorch: try: applyMp._torchNumThreads = applyMp._torchNumThreads or torch.get_num_threads(); torch.set_num_threads(1) except: pass # why do all of this? Because some strange interaction between PyTorch and multiprocessing, outlined here: https://github.com/pytorch/pytorch/issues/82843 os.environ["py_k1lib_in_applyMp"] = "True" self.p = mp.Pool(int(mp.cpu_count()*self.utilization), terminateGraceful); self.ps.append(self.p) if hasTorch and applyMp._torchNumThreads is not None: torch.set_num_threads(applyMp._torchNumThreads) def intercept(it, n): for i, e in enumerate(it): if i % n == 0: if self.p is not None: self.p.close(); self.ps.remove(self.p) gc.collect(); newPool() yield e common = dill.dumps([f, self.kwargs]) def gen(it): with k1lib.captureStdout(False, True) as out: try: if self.newPoolEvery > 0: it = intercept(it, self.newPoolEvery) else: newPool() fs = deque() for i, line in zip(range(self.prefetch), it): fs.append(self.p.apply_async(executeFunc, [common, dill.dumps(line)])) for line in it: yield fs.popleft().get(timeout) fs.append(self.p.apply_async(executeFunc, [common, dill.dumps(line)])) for f in fs: yield f.get(timeout) except KeyboardInterrupt as e: print("applyMp interrupted. Terminating pool now") for p in self.ps: p.close(); p.terminate(); raise e except Exception as e: print("applyMp encounter errors. Terminating pool now") for p in self.ps: p.close(); p.terminate(); raise e else: for p in self.ps: p.close(); p.terminate(); return gen(it)
[docs] @staticmethod def cat(fileName: str, f:Callable, n:int=None, rS=None, **kwargs): """Like :meth:`applyCl.cat`, this will split a file up into multiple sections, execute ``f`` over all sections and return the results. Example:: fn = "~/repos/labs/k1lib/k1lib/cli/test/applyMp.cat" "0123456789\\n"*100 | file(fn) # returns [6, 6, 6, 7, 6, 6, 6, 7, 6, 6, 6, 7, 6, 6, 6, 8] applyMp.cat(fn, shape(0), 16) | deref() :param f: function to execute on an iterator of lines :param n: how many chunks should it split the file into. Defaulted to the number of cpu cores available :param rS: :class:`~k1lib.cli.inp.refineSeek` instance, if you need more fine-grained control over section boundaries so as to not make everything corrupted :param kwargs: extra keyword arguments for :class:`applyMp`""" return fileName | cli.splitSeek(n or os.cpu_count()) | (rS or cli.iden()) | cli.window(2) | ~applyMp(lambda x,y: cli.cat(fileName, sB=x, eB=y) | f, **kwargs)
[docs] @staticmethod def shared(f, **kwargs): """Execution model where the input iterator is dereferenced and shared across all processes, bypassing serialization. Example:: a = range(1_000_000_000) | apply(lambda x: x*1.5 - 2000) | aS(list) # giant data structure a | batched(50_000_000, True) | applyMp(toSum()) | toSum() # has to serialize and deserialize lists of numbers, which wastes lots of cpu cycles and memory a | applyMp.shared(toSum()) | toSum() # giant data structure is forked, no serialization happens, no memory even gets copied, much faster In the 2nd line, most of the time is spent on serializing the data and transferring it to other processes, while in the 3rd line, most of the time is spent on calculating the sum instead, as the giant data structure is forked, and Linux doesn't copy it internally.""" def inner(it): try: n = len(it) except: it = list(it); n = len(it) # this is pretty unintuitive right? Why do it this way? Turns out, if you were to reference `it` directly, it will store it in f's co_freevars, # which will be serialized, defeating the purpose. Moving it to a global variable forces it to move to co_names instead, avoiding serialization. This took forever to understand idx = _k1_applyMp_global_ctx_autoInc(); _k1_applyMp_global_ctx[idx] = it res = range(n) | cli.batched(round(n/os.cpu_count()+1), True) | applyMp(lambda r: f(_k1_applyMp_global_ctx[idx][r.start:r.stop]), **kwargs) | aS(list) _k1_applyMp_global_ctx[idx] = None; return res return aS(inner)
def _copy(self): return applyMp(self.f, self.prefetch, self.timeout, self.utilization, self.bs, self.newPoolEvery, **self.kwargs)
[docs] def __invert__(self): """Expands the arguments out, just like :class:`apply`. Example:: # returns [20, 20, 18, 14, 8, 0, -10, -22, -36, -52] [range(10), range(20, 30)] | transpose() | ~applyMp(lambda x, y: y-x**2) | deref()""" res = self._copy(); f = res.f; res.f = lambda x: f(*x); return res
[docs] @staticmethod def clearPools(): """Terminate all existing pools. Do this before restarting/quitting the script/notebook to make sure all resources (like GPU) are freed. **Update**: you probably won't have to call this manually anymore since version 0.9, but if you run into problems, try doing this.""" for p in applyMp._pools: try: p.terminate() except: pass applyMp._pools = set()
[docs] @staticmethod def pools(): """Get set of all pools. Meant for debugging purposes only.""" return applyMp._pools
def __del__(self): return if hasattr(self, "p"): self.p.terminate(); if self.p in applyMp._pools: applyMp._pools.remove(self.p)
# apparently, this doesn't do anything, at least in jupyter environment atexit.register(lambda: applyMp.clearPools()) parallel = applyMp s = k1lib.Settings(); settings.add("applyCl", s, "modifier.applyCl() settings") s.add("sudoTimeout", 300, "seconds before deleting the stored password for sudo commands") _password = k1lib.Wrapper(None) def removePw(): while True: time.sleep(settings.applyCl.sudoTimeout); _password.value = None t = threading.Thread(target=removePw, daemon=True).start() def specificNode(obj, nodeId:str): return obj.options(scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(node_id=nodeId, soft=False))
[docs]class applyCl(BaseCli):
[docs] def __init__(self, f, prefetch=None, timeout=60, bs=1, rss:Union[dict, str]={}, pre:bool=False, orPatch=True, num_cpus=1, **kwargs): """Like :class:`apply`, but execute a function over the input iterator in multiple processes on multiple nodes inside of a cluster (hence "cl"). So, just a more powerful version of :class:`applyMp`, assuming you have a cluster to run it on. Example:: # returns [3, 2] ["abc", "de"] | applyCl(len) | deref() # returns [5, 6, 9] range(3) | applyCl(lambda x, bias: x**2+bias, bias=5) | deref() # returns [[1, 2, 3], [1, 2, 3]], demonstrating outside vars work someList = [1, 2, 3] ["abc", "de"] | applyCl(lambda s: someList) | deref() Internally, this uses the library Ray (https://www.ray.io) to do the heavy lifting. So, :class:`applyCl` can be thought of as a thin wrapper around that library, but still has the same consistent interface as :class:`apply` and :class:`applyMp`. From all of my tests so far, it seems that :class:`applyCl` works quite well and is quite robust, so if you have access to a cluster, use it over :class:`applyMp`. The library will connect to a Ray cluster automatically when you import everything using ``from k1lib.imports import *``. It will execute ``import ray; ray.init()``, which is quite simple. If you have ray installed, but does not want this default behavior, you can do this:: import k1lib k1lib.settings.startup.init_ray = False from k1lib.imports import * As with :class:`applyMp`, there are pitfalls and weird quirks to multiprocessing, on 1 or multiple nodes, so check out the docs over there to be aware of them, as those translates well to here. .. admonition:: Advanced use case Not really advanced, but just a bit difficult to understand/follow. Let's say that you want to scan through the home directory of all nodes, grab all files, read them, and get the number of bytes they have. You can do something like this:: a = None | applyCl.aS(lambda: None | cmd("ls ~") | filt(os.path.isfile) | deref()) | deref() b = a | ungroup(single=True, begin=True) | deref() c = b | applyCl(cat(text=False) | shape(0), pre=True) | deref() d = c | groupBy(0, True) | apply(item().all() | toSum(), 1) | deref() Noted, this is relatively complex. Let's see what A, B, C and D looks like:: # A [['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', ['Miniconda3-latest-Linux-x86_64.sh', 'mintupgrade-2023-04-01T232950.log']], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', ['5a', 'abc.jpg', 'a.txt']]] # B [['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 'Miniconda3-latest-Linux-x86_64.sh'], ['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 'mintupgrade-2023-04-01T232950.log'], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', '5a'], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 'abc.jpg'], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 'a.txt']] # C [['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 74403966], ['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 1065252], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 2601], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 16341], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 10177]] # D [['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 92185432], ['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 75469218]] The steps we're concerned with is A and C. In step A, we're running 2 processes, 1 for each node, to get all the file names in the home directory. In step C, we're running 5 processes total, 2 on the first node and 3 on the second node. For each process, it's going to read as bytes and count up those bytes. Finally in step D, the results are grouped together and the sizes summed. So yeah, it's pretty nice that we did all of that in a relatively short amount of code. The data is distributed too (reading multiple files from multiple nodes), so we're truly not bottlenecked by anything. :param prefetch: if not specified, schedules all jobs at the same time. If specified, schedules jobs so that there'll only be a specified amount of jobs, and will only schedule more if results are actually being used. :param timeout: seconds to wait for job before raising an error :param bs: if specified, groups ``bs`` number of transforms into 1 job to be more efficient. :param rss: resources required for the task. Can be {"custom_resource1": 2} or "custom_resource1" as a shortcut :param pre: "preserve", same convention as :meth:`applyCl.aS`. If True, then allow passing through node ids as the first column to shedule jobs on those specific nodes only :param orPatch: whether to automatically patch __or__ function so that cli tools can work with numpy arrays on that remote worker :param num_cpus: how many cpu does each task take? :param kwargs: extra arguments to be passed to the function. ``args`` not included as there're a couple of options you can pass for this cli.""" super().__init__(fs=[f]); _fC = fastF(f); self.ogF = f; self.pre = pre self.rss = rss = {rss: 1} if isinstance(rss, str) else rss def remoteF(e): if orPatch: import k1lib; k1lib.cli.init.patchNumpy() k1lib.cli.init.patchDict(); k1lib.cli.init.patchPandas() return _fC(e, **kwargs) self.remoteF = remoteF; self.f = ray.remote(resources=rss, num_cpus=num_cpus)(remoteF) self.prefetch = prefetch or int(1e9) self.timeout = timeout; self.bs = bs self._copyCtx = lambda: [f, [prefetch, timeout, bs, rss, pre, orPatch, num_cpus], kwargs] def preprocessF(f, e): # return future (if pre=False), or [nodeId, future] (if pre=True) if pre: nodeId, e = e; return [nodeId, specificNode(f, nodeId).remote(e)] else: return f.remote(e) def resolveF(e): if pre: return [e[0], ray.get(e[1], timeout=timeout)] else: return ray.get(e, timeout=timeout) self.preprocessF = preprocessF; self.resolveF = resolveF
[docs] def __ror__(self, it): f = self.f; timeout = self.timeout; bs = self.bs; ogF = self.ogF; preprocessF = self.preprocessF; resolveF = self.resolveF if bs > 1: return it | cli.batched(bs, True) | applyCl(lambda x: x | apply(ogF) | cli.aS(list), self.prefetch, timeout) | cli.joinStreams() def gen(it): futures = deque(); it = iter(it) for i, e in zip(range(self.prefetch), it): futures.append(preprocessF(f, e)) for e in it: yield resolveF(futures.popleft()); futures.append(preprocessF(f, e)) for e in futures: yield resolveF(e) return gen(it)
[docs] def __invert__(self): """Expands the arguments out, just like :class:`apply`. Example:: # returns [20, 20, 18, 14, 8, 0, -10, -22, -36, -52] [range(10), range(20, 30)] | transpose() | ~applyCl(lambda x, y: y-x**2) | deref()""" f, rest, kwargs = self._copyCtx(); return applyCl(lambda x: f(*x), *rest, **kwargs)
[docs] @staticmethod def nodeIds(includeSelf=True) -> List[str]: """Returns a list of all node ids in the current cluster. Example:: applyCl.nodeIds() # returns something like ['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', '1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068'] If you want to get nodes' metadata, then just use ray's builtin function ``ray.nodes()`` :param includeSelf: whether to include node id of the current process or not""" res = ray.nodes() | cli.filt(lambda x: x["Alive"]) | apply(lambda x: x["NodeID"]) | aS(list) if includeSelf: return res res.remove(applyCl.nodeId()); return res
[docs] @staticmethod def nodeId() -> str: """Returns current node id""" return ray.runtime_context.get_runtime_context().get_node_id()
[docs] @staticmethod def meta() -> object: """Grabs the metadata object for the current node""" return ray.nodes() | cli.filt(lambda x: x["NodeID"] == applyCl.nodeId()) | cli.item()
[docs] @staticmethod def cpu() -> int: """Grabs the number of cpus available on this node""" return int(applyCl.meta()["Resources"]["CPU"])
[docs] @staticmethod def aS(f, timeout:float=8): """Executes function f once for all node ids that are piped in. Example:: # returns [['1051da...', ['Desktop', 'Downloads']], ['7bb387...', ['Pictures', 'Music']]] applyCl.nodeIds() | applyCl.aS(lambda: None | cmd("ls ~") | deref()) | deref() # also returns [['1051da...', ['Desktop', 'Downloads']], ['7bb387...', ['Pictures', 'Music']]] None | applyCl.aS(lambda: None | cmd("ls ~") | deref()) | deref() If you want to execute f for all nodes, you can pass in None instead. As a reminder, this kinda follows the same logic as the popular cli :class:`aS`, where f is executed once, hence the name "apply Single". Here, the meaning of "single" is different. It just means execute once for each node ids. :param f: main function to execute in each node. Not supposed to accept any arguments :param timeout: seconds to wait for job before raising an error""" f = fastF(f); g = lambda nodeId: specificNode(ray.remote(f), nodeId).remote() final = cli.iden() & (apply(g) | aS(list) | apply(ray.get, timeout=timeout)) | cli.transpose() return aS(lambda it: (applyCl.nodeIds() if it is None else it) | final)
[docs] @staticmethod def cmd(s:str, timeout:float=8, sudo=False): """Convenience function to execute shell command on all nodes. Example:: applyCl.cmd("mkdir -p /some/folder") It returns [[nodeid1, output1], [nodeid2, output2]]. If you need more flexibility, fall back to :meth:`applyCl.aS` :param s: shell command to execute :param sudo: if True, will execute the command with sudo privileges. Will ask for password and then cache it internally for 5 minutes""" global _password; import getpass if sudo: if _password() is None: print("Enter password:"); _password.value = getpass.getpass(prompt="") return None | applyCl.aS(lambda: _password() | cli.cmd(f"sudo -S {s}") | cli.deref(), timeout) | cli.deref() else: return None | applyCl.aS(lambda: None | cli.cmd(s) | cli.deref(), timeout) | cli.deref()
[docs] @staticmethod def replicateFile(fn:str, nodeIds=None): """Replicates a specific file in the current node to all the other nodes. Example:: applyCl.replicate("~/cron.log") Internally, this will read chunks of 100kB of the specified file and dump it incrementally to all other nodes, which has implications on performance. To increase or decrease it, check out :class:`~k1lib.cli.inp.cat`. This also means you can replicate arbitrarily large files around as long as you have the disk space for it, while ram size doesn't really matter :param fn: file name""" fn = os.path.expanduser(fn); dirname = os.path.dirname(fn) if nodeIds is None: nodeIds = applyCl.nodeIds(False) nodeIds = nodeIds | cli.wrapList().all() | cli.deref() nodeIds | cli.insert(None, False).all() | applyCl(lambda _: None | cli.cmd(f"mkdir -p {dirname}; rm {fn}") | cli.deref(), pre=True) | cli.deref() for chunk in cli.cat(fn, text=False, chunks=True): nodeIds | cli.insert(chunk, False).all() | applyCl(lambda chunk: chunk >> cli.file(fn) | cli.deref(), pre=True) | cli.deref()
[docs] @staticmethod def balanceFile(fn:str, nAs:List[str]=None, nBs:List[str]=None, rS=None): """Splits a specified file in node nAs and dumps other parts to nodes nBs. Example:: applyCl.balanceFile("~/cron.log") This will split the big files up into multiple segments (1 for each node). Then for each segment, it will read through it chunk by chunk into memory, and then deposits it into the respective nodes. Finally, it truncates the original files down to its segment boundary. The main goal of this is so that you can analyze a single big (say 200GB) file quickly. If that file is on a single node, then it will take forever, even with :class:`applyMp`. So splitting things up on multiple nodes will make analyzing it a lot faster. There's also the function :meth:`balanceFolder`, which has the opposite problem of having lots of small (say 100MB) files. So it will try to move files around (keeping them intact in the meantime) to different nodes so that the folder size ratio is roughly proportional to the cpu count. The exact split rule depends on the number of CPUs of each node. Best to see an example:: Command: applyCl.balanceFile("~/cron.log") Verbose command: applyCl.balanceFile("~/cron.log", ["1"], ["1", "2", "3", "4", "5"]) ----------- Before ----------- Node: 1 2 3 4 5 Cpu: 8 12 16 8 8 Size (GB): 52 0 0 0 0 ----------- After ----------- Node: 1 2 3 4 5 Cpu: 8 12 16 8 8 Size (GB): 8 12 16 8 8 This also works if you have files on existing nodes already, and are upgrading the cluster:: Command: applyCl.balanceFile("~/cron.log") Verbose command: applyCl.balanceFile("~/cron.log", ["1", "5"], ["1", "2", "3", "4", "5"]) ----------- Before ----------- Node: 1 2 3 4 5 Cpu: 8 12 16 8 8 Size (GB): 26 0 0 26 0 ----------- After ----------- Node: 1 2 3 4 5 Cpu: 8 12 16 8 8 Size (GB): 8 12 16 8 8 If you want to move files out of a node when decommissioning them, you can do something like this:: Command: applyCl.decommission("~/cron.log", ["3", "4"]) Verbose command: applyCl.balanceFile("~/cron.log", ["1", "2", "3", "4", "5"], ["1", "2", "5"]) ----------- Before ----------- Node: 1 2 3 4 5 Cpu: 8 12 16 8 8 Size (GB): 8 12 16 8 8 ----------- After ----------- Node: 1 2 3 4 5 Cpu: 8 12 16 8 8 Size (GB): 15 22 0 0 15 Remember that the node ids "1", etc. is for illustrative purposes only. You should get real node ids from :meth:`nodeIds`. Why is the file size proportional to the number of cores on each node? Well, if you have more cores, you should be able to process more, so as to make everything balanced, right? Again, this means that you can split arbitrarily large files as long as you have the disk space for it, ram size is not a concern. How does this perform? Not the best in the world if you don't have a lot of nodes. With sata 3 ssds, 750MB/s ethernet, I got transfer speeds of roughly 100MB/s. This should increase as you have more nodes based on the code structure, but I haven't tested it yet. Can it be faster? Definitely. Am I willing to spend time optimizing it? No. :param fn: file name :param nAs: node ids that currently stores the file. If not specified, try to detect what nodes the file exists in :param nBs: node ids that will store the file after balancing everything out. If not specified, will take all available nodes :param rS: :class:`~k1lib.cli.inp.refineSeek` instance, if you need more fine-grained control over section boundaries so as to not make everything corrupted """ from k1lib.cli._applyCl import balanceFile balanceFile(fn, nAs, nBs, rS or cli.iden())
[docs] def decommission(self, fn, nAs:List[str], rS=None): """Convenience function for :meth:`balanceFile`. See docs over there.""" from k1lib.cli._applyCl import balanceFile balanceFile(fn, None, applyCl.nodeIds() | ~cli.inSet(nAs) | deref(), rS or cli.iden())
[docs] @staticmethod def cat(fn:str=None, f:Callable=None, timeout:float=60, keepNodeIds:bool=False, multiplier:int=1, includeId:bool=False): """Reads a file distributedly, does some operation on them, collects and returns all of the data together. Example:: fn = "~/repos/labs/k1lib/k1lib/cli/test/applyCl.cat.data" ("0123456789"*5 + "\\n") * 1000 | file(fn) applyCl.splitFile(fn) applyCl.cat(fn, shape(0), keepNodeIds=True) | deref() That returns something like this (for a 2-node cluster, with 2 (node A) and 4 (node B) cpus respectively):: [['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 167], ['7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d', 167], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 166], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 167], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 166], ['1051dafd2b0dac13561c46fe052f561400592f0723df2cd746a41068', 167]] Here, we're creating an initial file with 1000 lines. Then we'll split it up into 2 fragments: 334 lines and 667 lines and store them on the respective nodes. Then, on node A, we'll split the file up into 2 parts, each with 167 lines. On node B, we'll split the file up into 4 parts, each with around 166 lines. Then we'll schedule 6 processes total, each dealing with 166 lines. After all of that, results are collected together and returned. If you want to distinguish between different processes inside f, for example you want to write results into different files, you can do something like this:: dir_ = "~/repos/labs/k1lib/k1lib/cli/test" fn = f"{dir_}/applyCl.cat.data" applyCl.cmd(f"rm -r {dir_}/applyCl") # clear out old folders applyCl.cmd(f"mkdir -p {dir_}/applyCl") # creating folders # do processing on fn distributedly, then dump results into multiple files applyCl.cat(fn, ~aS(lambda idx, lines: lines | shape(0) | aS(dill.dumps) | file(f"{dir_}/applyCl/{idx}.pth")), includeId=True) | deref() # reading all files and summing them together None | applyCl.aS(lambda: ls(f"{dir_}/applyCl")) | ungroup(single=True, begin=True) | applyCl(cat(text=False) | aS(dill.loads), pre=True) | cut(1) | toSum() .. admonition:: Simple mode There's also another mode that's activated whenever f is not specified that feels more like vanilla :class:`~inp.cat`. Say you have a file on a specific node:: nodeId = "7bb387b2920694abe9f7d2a2ed939b6d31843faf91d174d0221e871d" fn = "~/ssd2/randomFile.txt" # -------------- file is on current node -------------- cat(fn) # returns iterator of lines inside the file fn | cat() # same thing as above # -------------- file is on remote node -------------- [nodeId, fn] | applyCl.cat() # returns iterator of lines of the file applyCl.cat([nodeId, fn]) # same thing nodeId | applyCl.cat(fn) # also same thing So yeah, there're lots of ways to just simply read a file on a remote node. Is it too much? Probably, but good thing is that you can pick any that's intuitive for you. Note that this mode is just for convenience only, for when you want to do exploratory analysis on a single remote file. To be efficient at bulk processing, use the normal mode instead. :param fn: file name :param f: function to execute in every process :param timeout: kills the processes if it takes longer than this amount of seconds :param keepNodeIds: whether to keep the node id column or not :param multiplier: by default, each node will spawn as many process as there are cpus. Sometimes you want to spawn more process, change this to a higher number :param includeId: includes a unique id for this process """ if f is None: # simple case def inner(nodeId_fn:Tuple[str, str]): nodeId, fn = nodeId_fn; seeks = [nodeId] | applyCl.aS(lambda: fn | cli.splitSeek(round(os.path.getsize(fn)/settings.cat.chunkSize+1))) | cli.cut(1) | cli.item() | cli.deref() inter = seeks | cli.window(2) | apply(cli.wrapList() | cli.insert(nodeId)) | cli.deref() return inter | ~applyCl(lambda sB, eB: cli.cat(fn,sB=sB,eB=eB) | cli.deref(), pre=True) | cli.cut(1) | cli.joinStreams() # return [nodeId_fn] | applyCl(cat() | deref(), pre=True) | cut(1) | item() # direct, no chunking method if fn is None: return aS(inner) # [nodeId, fn] | applyCl.cat() if isinstance(fn, str): return aS(lambda nodeId: inner([nodeId, fn])) # nodeId | applyCl.cat() else: return inner(fn) # applyCl.cat([nodeId, fn]) postprocess = cli.insertIdColumn(True, False) | ~apply(lambda x,y,z: [x,[*y,z]]) checkpoints = None | applyCl.aS(lambda: fn | cli.splitSeek(int(applyCl.meta()["Resources"]["CPU"]*multiplier)) | cli.window(2) | cli.deref()) | cli.ungroup(single=True, begin=True) | postprocess | cli.deref() postprocess = cli.iden() if keepNodeIds else cli.cut(1) return checkpoints | applyCl(~aS(lambda x,y,idx: cli.cat(fn, sB=x, eB=y) | ((cli.wrapList() | cli.insert(idx)) if includeId else cli.iden()) | f), pre=True, timeout=timeout, num_cpus=1) | postprocess
[docs] @staticmethod def balanceFolder(folder:str, maxSteps:int=None, audit:bool=False): """Balances all files within a folder across all nodes. Example:: base = "~/repos/labs/k1lib/k1lib/cli/test/applyCl.balance" # deletes old structures and making test folder applyCl.cmd(f"rm -r {base}"); applyCl.cmd(f"mkdir -p {base}") # creates 20 files of different sizes and dump it in the base folder of the current node torch.linspace(1e4, 1e5, 20).int() | apply(lambda x: "x"*x) | insertIdColumn() | ~apply(lambda idx, contents: contents | file(f"{base}/{idx}.txt")) | deref(); # transfers files between nodes such that the total folder size is proportional to the number of cpus across nodes applyCl.balanceFolder(base) # get folder size of all nodes None | applyCl.aS(lambda: ls(base) | apply(os.path.getsize) | toSum()) | deref() # creates 20 additional files and dump it to the current node torch.linspace(1e4, 1e5, 20).int() | apply(lambda x: "x"*x) | insertIdColumn() | ~apply(lambda idx, contents: contents | file(f"{base}/{idx+20}.txt")) | deref(); # balances the tree out again applyCl.balance(base) # get folder size of all nodes None | applyCl.aS(lambda: ls(base) | apply(os.path.getsize) | toSum()) | deref() So imagine that you just downloaded 1000 files to a single node on a specific folder, but you need to analyze all of them in a distributed manner. What you can do is to move some files to other nodes and then do your analysis. If you want to download more files, just dump it to any node (or download distributed across all nodes), then rebalance the folders and do your analysis. :param folder: folder to rebalance all of the files :param maxSteps: what's the maximum number of file transfers? By default has no limit, so that files are transferred until :param audit: if True, don't actually move files around and just return what files are going to be moved where""" from k1lib.cli._applyCl import balanceFolder return balanceFolder(folder, audit, maxSteps)
[docs] def download(url:str, folder:str, merge:bool=False, timeout=120, chunkTimeout=5): """Downloads a file distributedly to a specified folder. Example:: url = "https://vim.kelvinho.org" fn = "~/repos/labs/k1lib/k1lib/cli/test/applyCl.download" # file/folder name applyCl.download(url, fn) # will download distributedly and dump file fragments into the folder fn applyCl.download(url, fn, True) # same as above, but collects all fragments together, places it in fn in the current node, then deletes the temporary file fragments This only works if the server allows partial downloads btw. :param url: url to download :param folder: which folder to download parts into :param merge: whether to merge all of the fragments together into a single file in the current node or not :param timeout: timeout for each process :param chunkTimeout: timeout for each file chunk inside each process""" from k1lib.cli._applyCl import download download(url, folder, merge, timeout, chunkTimeout)
[docs] @staticmethod def diskScan(folder:str, raw=False): """Scans for files and folders in the specified folder for potential distributed files and folders. A distributed file is a file that exists on more than 1 node. A distributed folder is a folder that that exists on more than 1 node and does not have any shared children. Example:: applyCl.diskScan("~/ssd2") applyCl.diskScan("~/ssd2", True) The first line does not return anything, but will print out something like this: .. include:: ../literals/diskScan.rst While the second line will return a parseable data structure instead:: [[['/home/kelvin/ssd2/data/genome/RegulationFeatureActivity', [4113489746, 7912834090, 4164314316]], ['/home/kelvin/ssd2/data/genome/go/release_geneontology_org', [2071645117, 4172737915, 2107005131]], ['/home/kelvin/ssd2/data/genome/RegulationFeatureActivity.backup', [568878496, 552888466, 600610083]], ['/home/kelvin/ssd2/data/genome/00-common_all.idx', [341738564, 671136833, 0]], ['/home/kelvin/ssd2/data/genome/genbank/ch1.dat.gz', [25356744, 0, 25356764]], ['/home/kelvin/ssd2/test', [136152, 273530, 136351]], ['/home/kelvin/ssd2/data/genome/genbank/ch1', [0, 0, 0]]], [['/home/kelvin/ssd2/data/genome/dummy.txt', [1101, 1101, 1101]]], [['/home/kelvin/ssd2/data/genome/00-All.vcf', [32737509360, 65475018903, 32737509588]], ['/home/kelvin/ssd2/data/genome/MotifFeatures/homo_sapiens.GRCh38.motif_features.gff', [13963854962, 27927709895, 13963854962]], ['/home/kelvin/ssd2/data/genome/00-common_all.vcf', [2353901811, 4707803470, 2353901831]]]] Remember that since an operating system usually have lots of shared files (like "~/.bashrc", for example), these might be mistaken as a distributed file. Make sure to only scan folders that you store data in, or else it'll take a long time to return. :param folder: the folder to scan through :param raw: whether to return raw data or display it out nicely""" from k1lib.cli._applyCl import diskScan3, diskScan4 if size: return diskScan4(folder) else: return diskScan3(folder)
thEmptySentinel = object()
[docs]class applyTh(BaseCli):
[docs] def __init__(self, f, prefetch:int=None, timeout:float=5, bs:int=1, **kwargs): """Kinda like the same as :class:`applyMp`, but executes ``f`` on multiple threads, instead of on multiple processes. Advantages: - Relatively low overhead for thread creation - Fast, if ``f`` is io-bound - Does not have to serialize and deserialize the result, meaning iterators can be exchanged Disadvantages: - Still has thread creation overhead, so it's still recommended to specify ``bs`` - Is slow if ``f`` has to obtain the GIL to be able to do anything All examples from :class:`applyMp` should work perfectly here.""" fs = [f]; super().__init__(fs=fs); self.f = fs[0]; self.bs = bs; self.kwargs = kwargs self.prefetch = prefetch or int(1e9); self.timeout = timeout
[docs] def __ror__(self, it): if self.bs > 1: yield from (it | cli.batched(self.bs, True) | applyTh(apply(self.f), self.prefetch, self.timeout) | cli.joinStreams()); return datas = deque(); it = iter(it); kwargs = self.kwargs innerF = fastF(self.f); timeout = self.timeout def f(line, wrapper): wrapper.value = innerF(line, **kwargs) for _, line in zip(range(self.prefetch), it): w = k1lib.Wrapper(thEmptySentinel) t = threading.Thread(target=f, args=(line,w)) t.start(); datas.append((t, w)) for line in it: data = datas.popleft(); data[0].join(timeout) if data[1].value is thEmptySentinel: for data in datas: data[0].join(0.01) raise RuntimeError("Thread timed out!") yield data[1].value; w = k1lib.Wrapper(thEmptySentinel) t = threading.Thread(target=f, args=(line,w)) t.start(); datas.append((t, w)) for i in range(len(datas)): # do it this way so that python can remove threads early, due to ref counting data = datas.popleft(); data[0].join(timeout) if data[1].value is thEmptySentinel: for data in datas: data[0].join(0.01) raise RuntimeError("Thread timed out!") yield data[1].value
def _copy(self): return applyTh(self.f, self.prefetch, self.timeout, self.bs, **self.kwargs)
[docs] def __invert__(self): res = self._copy(); f = fastF(res.f) kw = res.kwargs res.f = lambda x: f(*x, **kw) res.kwargs = {} return res
[docs]class applySerial(BaseCli):
[docs] def __init__(self, f, *args, **kwargs): """Applies a function repeatedly. First yields input iterator ``x``. Then yields ``f(x)``, then ``f(f(x))``, then ``f(f(f(x)))`` and so on. Example:: # returns [2, 4, 8, 16, 32] 2 | applySerial(op()*2) | head(5) | deref() If the result of your operation is an iterator, you might want to :class:`~k1lib.cli.utils.deref` it, like this:: rs = iter(range(8)) | applySerial(rows()[::2]) # returns [0, 2, 4, 6] rs | rows(1) | item() | deref() # returns []. This is because all the elements are taken by the previous deref() rs | item() | deref() # returns [[2, 8], [10, -6], [4, 16], [20, -12]] [2, 8] | ~applySerial(lambda a, b: (a + b, a - b)) | head(4) | deref() rs = iter(range(8)) | applySerial(rows()[::2] | deref()) # returns [0, 2, 4, 6] rs | rows(1) | item() # returns [0, 4] rs | item() # or `next(rs)` # returns [0] rs | item() # or `next(rs)` :param f: function to apply repeatedly""" fs = [f]; super().__init__(fs=fs); self.f = fs[0] self.unpack = False; self.args = args; self.kwargs = kwargs
[docs] def __ror__(self, it): f = fastF(self.f) if self.unpack: while True: yield it; it = f(*it, *self.args, **self.kwargs) else: while True: yield it; it = f(it, *self.args, **self.kwargs)
[docs] def __invert__(self): ans = applySerial(self.f, *self.args, **self.kwargs) ans.unpack = True; return ans
[docs]class sort(BaseCli):
[docs] def __init__(self, column:int=0, numeric=True, reverse=False): """Sorts all lines based on a specific `column`. Example:: # returns [[5, 'a'], [1, 'b']] [[1, "b"], [5, "a"]] | ~sort(0) | deref() # returns [[2, 3]] [[1, "b"], [5, "a"], [2, 3]] | ~sort(1) | deref() # errors out, as you can't really compare str with int [[1, "b"], [2, 3], [5, "a"]] | sort(1, False) | deref() # returns [-1, 2, 3, 5, 8] [2, 5, 3, -1, 8] | sort(None) | deref() :param column: if None, sort rows based on themselves and not an element :param numeric: whether to convert column to float :param reverse: False for smaller to bigger, True for bigger to smaller. Use :meth:`__invert__` to quickly reverse the order instead of using this param""" self.column = column; self.reverse = reverse; self.numeric = numeric self.filterF = (lambda x: float(x)) if numeric else (lambda x: x)
[docs] def __ror__(self, it:Iterator[str]): c = self.column if c is None: return it | cli.wrapList() | cli.transpose() | sort(0, self.numeric, self.reverse) | cli.op()[0].all() f = self.filterF rows = (it | cli.isNumeric(c) if self.numeric else it) | cli.apply(list) def sortF(row): if len(row) > c: return f(row[c]) return float("inf") return sorted(rows, key=sortF, reverse=self.reverse)
[docs] def __invert__(self): """Creates a clone that has the opposite sort order""" return sort(self.column, self.numeric, not self.reverse)
[docs]class sortF(BaseCli):
[docs] def __init__(self, f:Callable[[T], float], column:int=None, reverse=False): """Sorts rows using a function. Example:: # returns ['a', 'aa', 'aaa', 'aaaa', 'aaaaa'] ["a", "aaa", "aaaaa", "aa", "aaaa"] | sortF(lambda r: len(r)) | deref() # returns ['aaaaa', 'aaaa', 'aaa', 'aa', 'a'] ["a", "aaa", "aaaaa", "aa", "aaaa"] | ~sortF(lambda r: len(r)) | deref()""" fs = [f]; super().__init__(fs=fs); self.f = fs[0] self.column = column; self.reverse = reverse
[docs] def __ror__(self, it:Iterator[T]) -> Iterator[T]: c = self.column; f = self.f if c is None: return sorted(list(it), key=f, reverse=self.reverse) def sortF(row): if len(row) > c: return f(row[c]) return float("inf") return sorted(list(it), key=sortF, reverse=self.reverse)
[docs] def __invert__(self) -> "sortF": return sortF(self.f, self.column, not self.reverse)
[docs]class consume(BaseCli):
[docs] def __init__(self, f:Union[BaseCli, Callable[[T], None]]): r"""Consumes the iterator in a side stream. Returns the iterator. Kinda like the bash command ``tee``. Example:: # prints "0\n1\n2" and returns [0, 1, 2] range(3) | consume(headOut()) | toList() # prints "range(0, 3)" and returns [0, 1, 2] range(3) | consume(lambda it: print(it)) | toList() This is useful whenever you want to mutate something, but don't want to include the function result into the main stream. See also: :class:`~k1lib.cli.output.tee`""" fs = [f]; super().__init__(fs=fs); self.f = fs[0]
[docs] def __ror__(self, it:T) -> T: self.f(it); return it
[docs]class randomize(BaseCli):
[docs] def __init__(self, bs=100, seed=None): """Randomize input stream. In order to be efficient, this does not convert the input iterator to a giant list and yield random values from that. Instead, this fetches ``bs`` items at a time, randomizes them, returns and fetch another ``bs`` items. If you want to do the giant list, then just pass in ``float("inf")``, or ``None``. Example:: # returns [0, 1, 2, 3, 4], effectively no randomize at all range(5) | randomize(1) | deref() # returns something like this: [1, 0, 2, 3, 5, 4, 6, 8, 7, 9]. You can clearly see the batches range(10) | randomize(3) | deref() # returns something like this: [7, 0, 5, 2, 4, 9, 6, 3, 1, 8] range(10) | randomize(float("inf")) | deref() # same as above range(10) | randomize(None) | deref() # returns True, as the seed is the same range(10) | randomize(seed=4) | deref() == range(10) | randomize(seed=4) | deref()""" self.bs = bs if bs != None else float("inf"); self.seed = seed; self._initGenn()
def _initGenn(self): if hasTorch: gen = torch.Generator().manual_seed(random.Random(self.seed).getrandbits(63)) self.genn = lambda n: torch.randperm(n, generator=gen) else: self.genn = np.random.permutation def __getstate__(self): genn = self.genn; self.genn = None; return self.__dict__ def __setstate__(self, d): self.__dict__.update(d); self._initGenn()
[docs] def __ror__(self, it:Iterator[T]) -> Iterator[T]: bs = self.bs if isinstance(it, settings.arrayTypes): if bs is None or len(it) <= bs: return it if len(it) == 1 else it[self.genn(len(it))] def gen(): for batch in it | cli.batched(bs, True): batch = list(batch); perms = self.genn(len(batch)) for idx in perms: yield batch[idx] return gen()
class StaggeredStream: def __init__(self, stream:Iterator[T], every:int): """Not intended to be instantiated by the end user. Use :class:`stagger` instead.""" self.stream = stream; self.every = every def __iter__(self): for i, v in zip(range(self.every), self.stream): yield v def __len__(self): """Length of window (length of result if you were to deref it).""" return self.every
[docs]class stagger(BaseCli):
[docs] def __init__(self, every:int): """Staggers input stream into multiple stream "windows" placed serially. Best explained with an example:: o = range(10) | stagger(3) o | deref() # returns [0, 1, 2], 1st "window" o | deref() # returns [3, 4, 5], 2nd "window" o | deref() # returns [6, 7, 8] o | deref() # returns [9] o | deref() # returns [] This might be useful when you're constructing a data loader:: dataset = [range(20), range(30, 50)] | transpose() dl = dataset | batched(3) | (transpose() | toTensor()).all() | stagger(4) for epoch in range(3): for xb, yb in dl: # looping over a window print(epoch) # then something like: model(xb) The above code will print 6 lines. 4 of them is "0" (because we stagger every 4 batches), and xb's shape' will be (3,) (because we batched every 3 samples). You should also keep in mind that this doesn't really change the property of the stream itself. Essentially, treat these pairs of statement as being the same thing:: o = range(11, 100) # both returns 11 o | stagger(20) | item() o | item() # both returns [11, 12, ..., 20] o | head(10) | deref() o | stagger(20) | head(10) | deref() Lastly, multiple iterators might be getting values from the same stream window, meaning:: o = range(11, 100) | stagger(10) it1 = iter(o); it2 = iter(o) next(it1) # returns 11 next(it2) # returns 12 This may or may not be desirable. Also this should be obvious, but I want to mention this in case it's not clear to you.""" self.every = int(every)
[docs] def __ror__(self, it:Iterator[T]) -> StaggeredStream: return StaggeredStream(iter(it), self.every)
[docs] @staticmethod def tv(every:int, ratio:float=0.8): """Convenience method to quickly stagger train and valid datasets. Example:: # returns [[16], [4]] [range(100)]*2 | stagger.tv(20) | shape().all() | deref()""" return stagger(round(every*ratio)) + stagger(round(every*(1-ratio)))
compareOps = {"__lt__", "__le__", "__eq__", "__ne__", "__gt__", "__ge__"}
[docs]class op(k1lib.Absorber, BaseCli):
[docs] def __init__(self): """Absorbs operations done on it and applies it on the stream. Based on :class:`~k1lib.Absorber`. Example:: # returns 16 4 | op()**2 # returns 16, equivalent to the above 4 | aS(lambda x: x**2) # returns [0, 1, 4, 9, 16] range(5) | apply(op()**2) | deref() # returns [0, 1, 4, 9, 16], equivalent to the above range(5) | apply(lambda x: x**2) | deref() Main advantage is that you don't have to waste keystrokes when you just want to do a simple operation. How it works underneath is a little magical, so just treat it as a blackbox. A more complex example:: t = torch.tensor([[1, 2, 3], [4, 5, 6.0]]) # returns [torch.tensor([[4., 5., 6., 7., 8., 9.]])] [t] | (op() + 3).view(1, -1).all() | deref() Basically, you can treat ``op()`` as the input tensor. Tbh, you can do the same thing with this:: [t] | applyS(lambda t: (t+3).view(-1, 1)).all() | deref() But that's kinda long and may not be obvious. This can be surprisingly resilient, as you can still combine with other cli tools as usual, for example:: # returns [2, 3], demonstrating "&" operator torch.randn(2, 3) | (op().shape & iden()) | deref() | item() a = torch.tensor([[1, 2, 3], [7, 8, 9]]) # returns torch.tensor([4, 5, 6]), demonstrating "+" operator for clis and not clis (a | op() + 3 + iden() | item() == torch.tensor([4, 5, 6])).all() # returns [[3], [3]], demonstrating .all() and "|" serial chaining torch.randn(2, 3) | (op().shape.all() | deref()) # returns [[8, 18], [9, 19]], demonstrating you can treat `op()` as a regular function [range(10), range(10, 20)] | transpose() | filt(op() > 7, 0) | deref() # returns [3, 4, 5, 6, 7, 8, 9], demonstrating bounds comparison range(100) | filt(3 <= op() < 10) | deref() This can only deal with simple operations only. For complex operations, resort to the longer version ``aS(lambda x: ...)`` instead! There are also operations that are difficult to achieve, like ``len(op())``, as Python is expecting an integer output, so ``op()`` can't exactly take over. Instead, you have to use :class:`aS`, or do ``op().ab_len()``. Get a list of all of these special operations in the source of :class:`~k1lib.Absorber`. Performance-wise, in most cases, there are no degradation, so don't worry about it. Everything is pretty much on par with native lambdas:: n = 10_000_000 # takes 1.48s for i in range(n): i**2 # takes 1.89s, 1.28x worse than for loop range(n) | apply(lambda x: x**2) | ignore() # takes 1.86s, 1.26x worse than for loop range(n) | apply(op()**2) | ignore() # takes 1.86s range(n) | (op()**2).all() | ignore() More complex operations still retains the same speeds, as there's a JIT compiler embedded in:: # takes 2.15s for i in range(n): (i**2-3)*0.1 # takes 2.53s, 1.18x worse than for loop range(n) | apply(lambda x: (x**2-3)*0.1) | ignore() # takes 2.46s, 1.14x worse than for loop range(n) | apply((op()**2-3)*0.1) | ignore() Reserved operations that are not absorbed are: - all - __ror__ (__or__ still works!) - ab_solidify - op_hint""" super().__init__({"_hint": None})
[docs] @staticmethod def solidify(f): """Static equivalent of ``a.ab_solidify()``. Example:: f = op()**2 f = op.solidify(f) If ``f`` is not an ``op``, then just return it without doing anything to it""" if f.__class__.__name__.split(".")[-1] == "op": f.ab_solidify() return f
[docs] def __ror__(self, it): return self.ab_operate(it)
def __or__(self, o): if isinstance(o, BaseCli): return super(k1lib.Absorber, self).__or__(o) return super().__add__(o) def __add__(self, o): if isinstance(o, BaseCli): return super(k1lib.Absorber, self).__add__(o) return super().__add__(o) def __and__(self, o): if isinstance(o, BaseCli): return super(k1lib.Absorber, self).__and__(o) return super().__and__(o) def __call__(self, *args, **kwargs): if self._ab_solidified: return self.ab_operate(*args, **kwargs) return super().__call__(*args, **kwargs) def _typehint(self, inp): return self._hint if self._hint is not None else tAny()
[docs] def op_hint(self, _hint): """Specify output type hint""" self._ab_sentinel = True; self._hint = _hint self._ab_sentinel = False; return self
cli.op = op
[docs]class integrate(BaseCli):
[docs] def __init__(self, dt=1): """Integrates the input. Example:: # returns [0, 1, 3, 6, 10, 15, 21, 28, 36, 45] range(10) | integrate() | deref() # returns [0, 2, 6, 12, 20, 30, 42, 56, 72, 90] range(10) | integrate(2) | deref() :param dt: Optional small step""" self.dt = dt
[docs] def __ror__(self, it): if self.dt == 1: s = 0 for e in it: s += e; yield s else: dt = self.dt; s = 0 for e in it: s += e*dt; yield s