Source code for k1lib.bioinfo.cli.structural

# AUTOGENERATED FILE! PLEASE DON'T EDIT
"""
This is for functions that change the table structure
in a dramatic way. They're the core transformations
"""
from typing import List, Union, Iterator, Callable, Any
from collections import defaultdict, Counter
from k1lib.bioinfo.cli.init import patchDefaultDelim, BaseCli, oneToMany
import k1lib.bioinfo.cli as cli
# Names exported by this module when it is star-imported.
__all__ = ["joinColumns", "joinRows", "joinStreams",
           "splitColumns", "insertRow", "insertIdColumn",
           "toDict", "split", "count", "permute", "accumulate", "AA_",
           "infinite"]
class joinColumns(BaseCli):
    def __init__(self, delim:str=None, sep:bool=False):
        """Zips several column streams together and emits them row by row.

:param delim: delimiter placed between the elements of a joined row. It is
    passed through ``patchDefaultDelim`` before being stored
:param sep: if True, each row is yielded as a tuple of its elements
    instead of being joined into a single string
"""
        self.sep = sep
        self.delim = patchDefaultDelim(delim)
    def __ror__(self, it:Iterator[Iterator[str]]):
        # zip(*it) turns the incoming columns into rows and stops at the
        # shortest column
        rows = zip(*it)
        if self.sep:
            yield from rows
            return
        for row in rows:
            yield self.delim.join(row | cli.toStr())
class joinRows(BaseCli):
    """Concatenates several streams of rows into one stream, in order."""
    def __ror__(self, streams:Iterator[Iterator[Any]]) -> Iterator[Any]:
        # Each inner stream is exhausted before the next one is started
        for rows in streams:
            for row in rows:
                yield row
# Alias: the same operation under a second name
joinStreams = joinRows
class splitColumns(BaseCli):
    def __init__(self, delim:str=None):
        """Splits lines into multiple columns, and returns the columns individually.

:param delim: delimiter used to split each line. It is passed through
    ``patchDefaultDelim`` before being stored

The result is a list of columns, where column ``i`` holds the ``i``-th
element of every line that has at least ``i + 1`` elements. Lines with fewer
elements simply don't contribute to the later columns."""
        self.delim = patchDefaultDelim(delim)
        self.lists = defaultdict(list)
    def __ror__(self, it):
        # Start from empty columns on every invocation. Previously the
        # columns lived only in __init__, so reusing one instance on a second
        # stream appended to the first stream's results.
        self.lists = columns = defaultdict(list)
        for line in it:
            for i, elem in enumerate(line.split(self.delim)):
                columns[i].append(elem)
        return list(columns.values())
class insertRow(BaseCli):
    def __init__(self, *columns:Union[List[str], str], delim:str=None):
        """Prepends a single row to the incoming stream.

:param columns: elements of the new row, given either as separate
    arguments or as one list/tuple
:param delim: delimiter used to join the elements into a line"""
        # A lone list/tuple argument is taken to be the row itself
        if len(columns) == 1 and isinstance(columns[0], (list, tuple)):
            columns = columns[0]
        self.delim = patchDefaultDelim(delim)
        self.columns = columns
    def __ror__(self, it:Iterator[str]):
        header = self.delim.join(self.columns)
        yield header
        yield from it
def insertIdColumn(begin=True, delim:str=None):
    """Builds a cli that adds an id column to every row.

:param begin: if True the id column comes first, otherwise last
:param delim: delimiter handed to :class:`joinColumns` when the id and
    the original row are joined back together"""
    if begin:
        pair = cli.toRange() & cli.identity()
    else:
        pair = cli.identity() & cli.toRange()
    return pair | cli.joinColumns(delim)
class toDict(BaseCli):
    def __init__(self, keyF:Callable[[Any], str]=None, valueF:Callable[[Any], Any]=None):
        """Turns an incoming stream into a dict, deriving both the key and
the value of each entry from the stream element. Example::

    names = ["wanda", "vision", "loki", "mobius"]
    names | toDict(valueF=lambda s: len(s)) # will return {"wanda": 5, "vision": 6, ...}
    names | toDict(lambda s: s.title(), lambda s: len(s)) # will return {"Wanda": 5, "Vision": 6, ...}

:param keyF: maps an element to its key. Defaults to the element itself
:param valueF: maps an element to its value. Defaults to the element itself
"""
        def passthrough(s): return s
        self.keyF = keyF or passthrough
        self.valueF = valueF or passthrough
    def __ror__(self, keys:Iterator[str]):
        toKey, toValue = self.keyF, self.valueF
        # Later elements overwrite earlier ones that map to the same key
        return dict((toKey(elem), toValue(elem)) for elem in keys)
class split(BaseCli):
    def __init__(self, delim:str=None, idx:int=None):
        """Splits each line using a delimiter, and outputs the
parts as separate lines.

:param delim: delimiter used to split each line. It is passed through
    ``patchDefaultDelim`` before being stored
:param idx: if given, only the element at that index is output for each
    line. Negative indexes count from the end. Lines that don't have an
    element at that index produce None instead"""
        self.delim = patchDefaultDelim(delim); self.idx = idx
    def __ror__(self, it:Iterator[str]):
        delim, idx = self.delim, self.idx
        if idx is None:
            for line in it:
                yield from line.split(delim)
            return
        for line in it:
            elems = line.split(delim)
            # Guard both ends: the original only checked idx < len(elems),
            # so a too-negative idx raised IndexError instead of giving None
            yield elems[idx] if -len(elems) <= idx < len(elems) else None
class count(BaseCli):
    def __init__(self, delim:str=None):
        """Counts how often each distinct element occurs and yields one
line per element, formatted as "{count}{delim}{element}{delim}{percent}%",
where percent is the element's share of all elements, rounded to an integer.

:param delim: delimiter placed between the three fields"""
        self.delim = patchDefaultDelim(delim)
    def __ror__(self, it:Iterator[str]):
        occurrences = Counter(it)
        total = sum(occurrences.values())
        for elem, n in occurrences.items():
            yield f"{n}{self.delim}{elem}{self.delim}{round(100*n/total)}%"
class permute(BaseCli):
    def __init__(self, permutations:List[int], delim:str=None):
        """Reorders the columns of every line. Acts like torch.permute(...)

:param permutations: for each output position, the index of the input
    column to place there
:param delim: delimiter used both to split the line and to join it back"""
        self.delim = patchDefaultDelim(delim)
        self.permutations = permutations
    def __ror__(self, it:Iterator[str]):
        for line in it:
            fields = line.split(self.delim)
            reordered = [fields[src] for src in self.permutations]
            yield self.delim.join(reordered)
class accumulate(BaseCli):
    def __init__(self, column:int=0, avg=False, delim:str=None):
        """Groups lines that have the same line.split(delim)[column], and
adds together all other columns, assuming they're floats. A column element
that can't be added as a float replaces whatever was accumulated for that
column so far, and is output as-is.

One line is yielded per distinct key, with the key put back at position
``column``.

:param column: common column to accumulate
:param avg: calculate average values (sum divided by the number of lines
    in the group) instead of the sum
:param delim: specify delimiter between columns"""
        self.column = column; self.avg = avg
        self.delim = patchDefaultDelim(delim)
        self.dict = defaultdict(lambda: defaultdict(lambda: 0))
    def __ror__(self, it:Iterator[str]):
        # Fresh accumulator per invocation, so reusing the instance on another
        # stream doesn't mix in the previous stream's totals
        self.dict = groups = defaultdict(lambda: defaultdict(lambda: 0))
        rowCounts = Counter() # number of lines seen for each key
        for line in it:
            elems = line.split(self.delim); key = elems.pop(self.column)
            rowCounts[key] += 1
            for i, elem in enumerate(elems):
                # ValueError: elem isn't a number. TypeError: this column
                # already holds a string from an earlier line
                try: groups[key][i] += float(elem)
                except (ValueError, TypeError): groups[key][i] = elem
        for key, values in groups.items():
            n = len(values)
            if self.avg:
                # Average over the lines in the group. The original divided
                # by the number of columns, which is unrelated to the mean
                for i in range(n):
                    if isinstance(values[i], float):
                        values[i] /= rowCounts[key]
            elems = [str(values[i]) for i in range(n)]
            elems.insert(self.column, key)
            yield self.delim.join(elems)
class AA_(BaseCli):
    def __init__(self, *idxs:int, wraps=False):
        """Returns 2 streams, one that has the selected element, and the other
the rest. Example::

    [1, 5, 6, 3, 7] | AA_(1) # will return [5, [1, 6, 3, 7]]

You can also put multiple indexes through::

    [1, 5, 6] | AA_(0, 2) # will return [[1, [5, 6]], [6, [1, 5]]]

If you put None in, then all indexes will be sliced::

    [1, 5, 6] | AA_(None)

    # will return:
    # [[1, [5, 6]],
    #  [5, [1, 6]],
    #  [6, [1, 5]]]

Negative indexes count from the end, so ``[1, 5, 6] | AA_(-1)`` returns
``[6, [1, 5]]``.

As for why the strange name, think of this operation as "AĀ". In statistics,
say you have a set "A", then "not A" is commonly written as A with an overline
"Ā". So "AA\\_" represents "AĀ", and that it first returns the selection A
first.

:param idxs: indexes of the elements to select, or a single None to select
    every index in turn
:param wraps: if True, then the first example will return [[5, [1, 6, 3, 7]]]
    instead, so that A has the same signature as Ā
"""
        self.idxs = idxs; self.wraps = wraps
    def __ror__(self, it:List[Any]) -> List[List[List[Any]]]:
        idxs = self.idxs; it = list(it)
        sliceAll = len(idxs) == 1 and idxs[0] is None
        if sliceAll: idxs = range(len(it))
        def gen(idx):
            selected = it[idx] # raises IndexError for an out-of-range idx
            # Normalize a negative idx so the selected element is actually
            # left out of the remainder
            pos = idx if idx >= 0 else idx + len(it)
            return [selected, [v for i, v in enumerate(it) if i != pos]]
        # Only an explicitly given single index is returned unwrapped. The
        # None form always yields a list of pairs, even when the input has
        # exactly one element
        if not self.wraps and not sliceAll and len(idxs) == 1:
            return gen(idxs[0])
        return [gen(idx) for idx in idxs]
class infinite(BaseCli):
    """Reads a stream once and then yields its contents, as a list, forever.
Example:

.. code-block::

    # returns [[1, 2, 3], [1, 2, 3], [1, 2, 3]]
    [1, 2, 3] | infinite() | head(3) | toList()
"""
    def __ror__(self, it:Iterator[Any]) -> Iterator[Iterator[Any]]:
        # The stream is materialized once; the very same list object is
        # handed out on every iteration
        snapshot = list(it)
        while True:
            yield snapshot