Source code for k1lib.cli.modifier

# AUTOGENERATED FILE! PLEASE DON'T EDIT
"""
This is for quick modifiers, think of them as changing formats
"""
__all__ = ["apply", "applyMp", "applyMpBatched", "applyCached", "applyS",
           "replace", "remove", "toFloat", "toInt",
           "sort", "sortF", "consume", "randomize", "stagger", "op"]
from typing import Callable, Iterator, Any, Union, List
from k1lib.cli.init import patchDefaultDelim, BaseCli, settings, T
import k1lib.cli as cli, numpy as np, torch
import torch.multiprocessing as mp; from collections import deque
from functools import partial, update_wrapper
import dill, pickle, k1lib, warnings, atexit
def executeFunc(common, line):
    import dill
    f, kwargs = dill.loads(common)
    return f(dill.loads(line), **kwargs)
[docs]class applyMp(BaseCli): _pools = set()
[docs] def __init__(self, f:Callable[[T], T], prefetch:int=None, timeout:float=2, **kwargs): """Like :class:`apply`, but execute ``f(row)`` of each row in multiple processes. Example:: # returns [3, 2] ["abc", "de"] | applyMp(lambda s: len(s)) | deref() # returns [5, 6, 9] range(3) | applyMp(lambda x, bias: x**2+bias, bias=5) | deref() # returns [[1, 2, 3], [1, 2, 3]], demonstrating outside vars work someList = [1, 2, 3] ["abc", "de"] | applyMp(lambda s: someList) | deref() Internally, this will continuously spawn new jobs up until 80% of all CPU cores are utilized. On posix systems, the default multiprocessing start method is ``fork()``. This sort of means that all the variables in memory will be copied over. This might be expensive (might also not, with copy-on-write), so you might have to think about that. On windows and macos, the default start method is ``spawn``, meaning each child process is a completely new interpreter, so you have to pass in all required variables and reimport every dependencies. Read more at https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods If you don't wish to schedule all jobs at once, you can specify a ``prefetch`` amount, and it will only schedule that much jobs ahead of time. Example:: range(10000) | applyMp(lambda x: x**2) | head() | deref() # 700ms range(10000) | applyMp(lambda x: x**2, 5) | head() | deref() # 300ms # demonstrating there're no huge penalties even if we want all results at the same time range(10000) | applyMp(lambda x: x**2) | deref() # 900ms range(10000) | applyMp(lambda x: x**2, 5) | deref() # 1000ms The first line will schedule all jobs at once, and thus will require more RAM and compute power, even though we discard most of the results anyway (the :class:`~k1lib.cli.filt.head` cli). The second line only schedules 5 jobs ahead of time, and thus will be extremely more efficient if you don't need all results right away. .. note:: Remember that every :class:`~k1lib.cli.init.BaseCli` is also a function, meaning that you can do stuff like:: # returns [['ab', 'ac']] [["ab", "cd", "ac"]] | applyMp(filt(op().startswith("a")) | deref()) | deref() Also remember that the return result of ``f`` should not be a generator. That's why in the example above, there's a ``deref()`` inside f. Most of the time, you'd probably want to use :class:`applyMpBatched` instead. That cli tool has the same look and feel as this, but executes ``f`` multiple times in a single job, instead of executing ``f`` only 1 time per job here, so should dramatically improve performance for most workloads. One last thing. Remember to close all pools (using :meth:`clearPools`) before exiting the script so that all child processes are terminated, and that resources are freed. Let's say if you use CUDA tensors, but have not close all pools yet, then it is possible that CUDA memory is not freed. I learned this the hard way. I've tried to use :mod:`atexit` to close pools automatically, but it doesn't seem to work with notebooks. :param prefetch: if not specified, schedules all jobs at the same time. If specified, schedules jobs so that there'll only be a specified amount of jobs, and will only schedule more if results are actually being used. :param timeout: seconds to wait for job before raising an error :param kwargs: extra arguments to be passed to the function. ``args`` not included as there're a couple of options you can pass for this cli.""" super().__init__(fs=[f]); self.f = f; self.prefetch = prefetch or 1_000_000 self.timeout = timeout; self.kwargs = kwargs
[docs] def __ror__(self, it:Iterator[T]) -> Iterator[T]: super().__ror__(it); it = iter(it) # really make sure it's an iterator, for prefetch self.p = p = mp.Pool(mp.cpu_count()*4//5) applyMp._pools.add(p); timeout = self.timeout common = dill.dumps([self.f, self.kwargs]) def gen(): fs = deque() for i, line in zip(range(self.prefetch), it): fs.append(p.apply_async(executeFunc, [common, dill.dumps(line)])) for line in it: yield fs.popleft().get(timeout) fs.append(p.apply_async(executeFunc, [common, dill.dumps(line)])) for f in fs: yield f.get(timeout) return gen()
[docs] @staticmethod def clearPools(): """Terminate all existing pools. Do this before restarting/quitting the script/notebook to make sure all resources (like GPU) are freed.""" for p in applyMp._pools: try: p.terminate() except: pass applyMp._pools = set()
[docs] @staticmethod def pools(): """Get set of all pools. Meant for debugging purposes only.""" return applyMp._pools
atexit.register(lambda: applyMp.clearPools())
[docs]class applyS(BaseCli):
[docs] def __init__(self, f:Callable[[T], T]): """Like :class:`apply`, but much simpler, just operating on the entire input object, essentially. The "S" stands for "single". Example:: # returns 5 3 | applyS(lambda x: x+2) Like :class:`apply`, you can also use this as a decorator like this:: @applyS def f(x): return x+2 # returns 5 3 | f This also decorates the returned object so that it has same qualname, docstring and whatnot.""" super().__init__(fs=[f]); self.f = f update_wrapper(self, f)
[docs] def __ror__(self, it:T) -> T: return self.f(it)
[docs] def all(self): return apply(self.f)
[docs]class apply(BaseCli):
[docs] def __init__(self, f:Callable[[str], str], column:int=None): """Applies a function f to every line. Example:: # returns [0, 1, 4, 9, 16] range(5) | apply(lambda x: x**2) | deref() # returns [[3.0, 1.0, 1.0], [3.0, 1.0, 1.0]] torch.ones(2, 3) | apply(lambda x: x+2, 0) | deref() You can also use this as a decorator, like this:: @apply def f(x): return x**2 # returns [0, 1, 4, 9, 16] range(5) | f | deref() :param column: if not None, then applies the function to that column only""" super().__init__(fs=[f]); self.f = f.f if isinstance(f, applyS) else f self.column = column
[docs] def __ror__(self, it:Iterator[str]): super().__ror__(it); f = self.f; c = self.column if isinstance(f, cli.op): f = f.ab_operate if c is None: return (f(line) for line in it) else: return ([(e if i != c else f(e)) for i, e in enumerate(row)] for row in it)
[docs]def applyMpBatched(f, bs=32, prefetch=2, timeout=5): """Pretty much the same as :class:`applyMp` and has the same feel to it too. Iterator[A] goes in, Iterator[B] goes out, and you specify `f(A) -> B`. However, this will launch jobs that will execute multiple f(), instead of 1 job per execution. All examples from :class:`applyMp` should work perfectly here.""" return cli.batched(bs, True) | applyMp(apply(f) | cli.deref(), prefetch, timeout) | cli.joinStreams()
[docs]class applyCached(BaseCli):
[docs] def __init__(self, f, limit:int=1000): """Like :class:`apply`, but caches the results, so subsequent requests are faster. All examples from :class:`apply` should work. Example:: # returns [0, 1, 4, 9, 16, 0, 1, 4, 9, 16] [*range(5), *range(5)] | applyCached(lambda x: x**2) | cli.deref() I'm thinking about just adding a ``cacheLimit`` argument to :class:`apply`, and have it integrate with everything. However, this feature doesn't seem useful enough yet. May be in a future version. :param limit: max cache size""" super().__init__(fs=[f]); self.f = f self.limit = limit; self.lookup = dict()
[docs] def __ror__(self, it): lookup = self.lookup; f = self.f; limit = self.limit for e in it: if e not in lookup: lookup[e] = f(e) yield lookup[e] if len(lookup) > limit: del a[next(iter(a.keys()))]
[docs]def replace(s:str, target:str=None, column:int=None): """Replaces substring `s` with `target` for each line. Example:: # returns ['104', 'ab0c'] ["1234", "ab23c"] | replace("23", "0") | deref() :param target: if not specified, then use the default delimiter specified in ``cliSettings``""" t = patchDefaultDelim(target) return apply(lambda e: e.replace(s, t), column)
[docs]def remove(s:str, column:int=None): """Removes a specific substring in each line.""" return replace(s, "", column)
def _op(toOp, c, force, defaultValue): return apply(toOp, c) | (apply(lambda x: x or defaultValue, c) if force else (~cli.isValue(None, c))) def _toFloat(e) -> Union[float, None]: try: return float(e) except: return None
[docs]def toFloat(*columns:List[int], force=False): """Converts every row into a float. Example:: # returns [1, 3, -2.3] ["1", "3", "-2.3"] | toFloat() | deref() # returns [[1.0, 'a'], [2.3, 'b'], [8.0, 'c']] [["1", "a"], ["2.3", "b"], [8, "c"]] | toFloat(0) | deref() With weird rows:: # returns [[1.0, 'a'], [8.0, 'c']] [["1", "a"], ["c", "b"], [8, "c"]] | toFloat(0) | deref() # returns [[1.0, 'a'], [0.0, 'b'], [8.0, 'c']] [["1", "a"], ["c", "b"], [8, "c"]] | toFloat(0, force=True) | deref() :param columns: if nothing, then will convert each row. If available, then convert all the specified columns :param force: if True, forces weird values to 0.0, else filters out all weird rows""" if len(columns) > 0: return cli.init.serial(*(_op(_toFloat, c, force, 0.0) for c in columns)) else: return _op(_toFloat, None, force, 0.0)
def _toInt(e) -> Union[int, None]: try: return int(float(e)) except: return None
[docs]def toInt(*columns:List[int], force=False): """Converts every row into an integer. Example:: # returns [1, 3, -2] ["1", "3", "-2.3"] | toInt() | deref() :param columns: if nothing, then will convert each row. If available, then convert all the specified columns :param force: if True, forces weird values to 0, else filters out all weird rows See also: :meth:`toFloat`""" if len(columns) > 0: return cli.init.serial(*(_op(_toInt, c, force, 0) for c in columns)) else: return _op(_toInt, None, force, 0)
[docs]class sort(BaseCli):
[docs] def __init__(self, column:int=0, numeric=True, reverse=False): """Sorts all lines based on a specific `column`. Example:: # returns [[5, 'a'], [1, 'b']] [[1, "b"], [5, "a"]] | ~sort(0) | deref() # returns [[2, 3]] [[1, "b"], [5, "a"], [2, 3]] | ~sort(1) | deref() # errors out, as you can't really compare str with int [[1, "b"], [2, 3], [5, "a"]] | sort(1, False) | deref() :param column: if None, sort rows based on themselves and not an element :param numeric: whether to convert column to float :param reverse: False for smaller to bigger, True for bigger to smaller. Use :meth:`__invert__` to quickly reverse the order instead of using this param""" super().__init__() self.column = column; self.reverse = reverse; self.numeric = numeric self.filterF = (lambda x: float(x)) if numeric else (lambda x: x)
[docs] def __ror__(self, it:Iterator[str]): super().__ror__(it); c = self.column if c is None: return it | cli.wrapList() | cli.transpose() | sort(0, self.numeric, self.reverse) f = self.filterF rows = (it | cli.isNumeric(c) if self.numeric else it) | cli.deref(maxDepth=2) def sortF(row): if len(row) > c: return f(row[c]) return float("inf") return iter(sorted(rows, key=sortF, reverse=self.reverse))
[docs] def __invert__(self): """Creates a clone that has the opposite sort order""" return sort(self.column, self.numeric, not self.reverse)
[docs]class sortF(BaseCli):
[docs] def __init__(self, f:Callable[[T], float], reverse=False): """Sorts rows using a function. Example:: # returns ['a', 'aa', 'aaa', 'aaaa', 'aaaaa'] ["a", "aaa", "aaaaa", "aa", "aaaa"] | sortF(lambda r: len(r)) | deref() # returns ['aaaaa', 'aaaa', 'aaa', 'aa', 'a'] ["a", "aaa", "aaaaa", "aa", "aaaa"] | ~sortF(lambda r: len(r)) | deref()""" super().__init__(fs=[f]); self.f = f; self.reverse = reverse
[docs] def __ror__(self, it:Iterator[T]) -> Iterator[T]: super().__ror__(it) return iter(sorted(list(it), key=self.f, reverse=self.reverse))
[docs] def __invert__(self) -> "sortF": return sortF(self.f, not self.reverse)
[docs]class consume(BaseCli):
[docs] def __init__(self, f:Union[BaseCli, Callable[[T], None]]): r"""Consumes the iterator in a side stream. Returns the iterator. Kinda like the bash command ``tee``. Example:: # prints "0\n1\n2" and returns [0, 1, 2] range(3) | consume(headOut()) | toList() # prints "range(0, 3)" and returns [0, 1, 2] range(3) | consume(lambda it: print(it)) | toList() This is useful whenever you want to mutate something, but don't want to include the function result into the main stream.""" super().__init__(fs=[f]); self.f = f
[docs] def __ror__(self, it:T) -> T: super().__ror__(it); self.f(it); return it
[docs]class randomize(BaseCli):
[docs] def __init__(self, bs=100): """Randomize input stream. In order to be efficient, this does not convert the input iterator to a giant list and yield random values from that. Instead, this fetches ``bs`` items at a time, randomizes them, returns and fetch another ``bs`` items. If you want to do the giant list, then just pass in ``float("inf")``, or ``None``. Example:: # returns [0, 1, 2, 3, 4], effectively no randomize at all range(5) | randomize(1) | deref() # returns something like this: [1, 0, 2, 3, 5, 4, 6, 8, 7, 9]. You can clearly see the batches range(10) | randomize(3) | deref() # returns something like this: [7, 0, 5, 2, 4, 9, 6, 3, 1, 8] range(10) | randomize(float("inf")) | deref() # same as above range(10) | randomize(None) | deref()""" super().__init__(); self.bs = bs if bs != None else float("inf")
[docs] def __ror__(self, it:Iterator[T]) -> Iterator[T]: super().__ror__(it) for batch in it | cli.batched(self.bs, True): batch = list(batch); perms = torch.randperm(len(batch)) for idx in perms: yield batch[idx]
class StaggeredStream: def __init__(self, stream:Iterator[T], every:int): """Not intended to be instantiated by the end user. Use :class:`stagger` instead.""" self.stream = stream; self.every = every def __iter__(self): for i, v in zip(range(self.every), self.stream): yield v def __len__(self): """Length of window (length of result if you were to deref it).""" return self.every
[docs]class stagger(BaseCli): """Staggers input stream into multiple stream "windows" placed serially. Best explained with an example:: o = range(10) | stagger(3) o | deref() # returns [0, 1, 2], 1st "window" o | deref() # returns [3, 4, 5], 2nd "window" o | deref() # returns [6, 7, 8] o | deref() # returns [9] o | deref() # returns [] This might be useful when you're constructing a data loader:: dataset = [range(20), range(30, 50)] | transpose() dl = dataset | batched(3) | (transpose() | toTensor()).all() | stagger(4) for epoch in range(3): for xb, yb in dl: # looping over a window print(epoch) # then something like: model(xb) The above code will print 6 lines. 4 of them is "0" (because we stagger every 4 batches), and xb's shape' will be (3,) (because we batched every 3 samples). You should also keep in mind that this doesn't really change the property of the stream itself. Essentially, treat these pairs of statement as being the same thing:: o = range(11, 100) # both returns 11 o | stagger(20) | item() o | item() # both returns [11, 12, ..., 20] o | head(10) | deref() o | stagger(20) | head(10) | deref() Lastly, multiple iterators might be getting values from the same stream window, meaning:: o = range(11, 100) | stagger(10) it1 = iter(o); it2 = iter(o) next(it1) # returns 11 next(it2) # returns 12 This may or may not be desirable. Also this should be obvious, but I want to mention this in case it's not clear to you.""" def __init__(self, every:int): self.every = int(every)
[docs] def __ror__(self, it:Iterator[T]) -> StaggeredStream: return StaggeredStream(iter(it), self.every)
[docs]class op(k1lib.Absorber, BaseCli): """Absorbs operations done on it and applies it on the stream. Based on :class:`~k1lib.Absorber`. Example:: t = torch.tensor([[1, 2, 3], [4, 5, 6.0]]) # returns [torch.tensor([[4., 5., 6., 7., 8., 9.]])] [t] | (op() + 3).view(1, -1).all() | deref() Basically, you can treat ``op()`` as the input tensor. Tbh, you can do the same thing with this:: [t] | applyS(lambda t: (t+3).view(-1, 1)).all() | deref() But that's kinda long and may not be obvious. This can be surprisingly resilient, as you can still combine with other cli tools as usual, for example:: # returns [2, 3], demonstrating "&" operator torch.randn(2, 3) | (op().shape & identity()) | deref() | item() a = torch.tensor([[1, 2, 3], [7, 8, 9]]) # returns torch.tensor([4, 5, 6]), demonstrating "+" operator for clis and not clis (a | op() + 3 + identity() | item() == torch.tensor([4, 5, 6])).all() # returns [[3], [3]], demonstrating .all() and "|" serial chaining torch.randn(2, 3) | (op().shape.all() | deref()) # returns [[8, 18], [9, 19]], demonstrating you can treat `op()` as a regular function [range(10), range(10, 20)] | transpose() | filt(op() > 7, 0) | deref() Performance-wise, there are some, but not a lot of degradation, so don't worry about it:: n = 10_000_000 # takes 1.6s for i in range(n): i**2 # takes 1.8s, 1.125x worse than for loop range(n) | apply(lambda x: x**2) | ignore() # takes 2.7s, 1.7x worse than for loop range(n) | apply(op()**2) | ignore() # takes 2.7s range(n) | (op()**2).all() | ignore() Reserved operations that are not absorbed are: - all - __ror__ (__or__ still works!) - op_solidify""" def __init__(self): super().__init__({"_op_solidified": False})
[docs] def op_solidify(self): """Use this to not absorb ``__call__`` operations anymore and makes it feel like a regular function (still absorbs other operations though):: f = op()**2 3 | f # returns 9, but may be you don't want to pipe it in f.op_solidify() f(3) # returns 9""" self._ab_sentinel = True self._op_solidified = True self._ab_sentinel = False return self
[docs] def __ror__(self, it): return self.ab_operate(it)
def __or__(self, o): if isinstance(o, BaseCli): return super(k1lib.Absorber, self).__or__(o) return super().__add__(o) def __add__(self, o): if isinstance(o, BaseCli): return super(k1lib.Absorber, self).__add__(o) return super().__add__(o) def __and__(self, o): if isinstance(o, BaseCli): return super(k1lib.Absorber, self).__and__(o) return super().__and__(o) def __call__(self, *args, **kwargs): if self._op_solidified: return self.ab_operate(*args, **kwargs) return super().__call__(*args, **kwargs)