from k1lib.imports import *
import unicodedata, string


namesFolder = "cli_name_languages/names"
nameFiles = glob.glob(f"{namesFolder}/*.txt")
withBareNames = insertColumn(*(nameFiles | op().split("/")[-1].all() | op().split(".")[0].all())) | display(None)
nameFiles[:3], len(nameFiles)

(['cli_name_languages/names/Korean.txt',
  'cli_name_languages/names/Spanish.txt',
  'cli_name_languages/names/Greek.txt'],
 18)


cat(nameFiles[0]) | headOut(3)

Ahn
Baik
Bang


nameFiles[0] | cat() | headOut(3)

Ahn
Baik
Bang


letters = string.ascii_letters + ".,;'"
def unicodeToAscii(s, notIn=False):
    if notIn: # debug case
        return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn" and c not in letters)
    else: # "right" case
        return "".join(c for c in unicodedata.normalize("NFD", s) if unicodedata.category(c) != "Mn" and c in letters)


nameFiles | cat().all() | joinStreams() | shape(0)

20074


def unicodes(): return nameFiles | cat().all() | joinStreams() | apply(partial(unicodeToAscii, notIn=True))
unicodes() | count() | display(None)

19962          99%   
47             0%    
3              0%    
21      -      0%    
2       --     0%    
1              0%    
23             0%    
1       /      0%    
3       1      0%    
9       ß      0%    
1       ł      0%    
1       :      0%


unicodes() | op().strip().all() | filt(op() != "") | count() | display()

21   -    55%   
2    --   5%    
1    /    3%    
3    1    8%    
9    ß    24%   
1    ł    3%    
1    :    3%


nameFiles | cat().all() | (count() | filt(op() != "1", 0) | shape(0)).all() | unsqueeze(1) | withBareNames

Korean       94     
Spanish      296    
Greek        193    
Irish        226    
Scottish     100    
Portuguese   74     
Russian      9342   
Czech        503    
French       273    
German       706    
Japanese     990    
Polish       138    
Arabic       108    
English      3668   
Chinese      246    
Dutch        286    
Italian      701    
Vietnamese   71


nameFiles | cat().all() | toSet().all() | joinStreams() | (identity() & toSet()) | shape(0).all() | deref()

[18015, 17458]


nameFiles | AA_(0) | ((cat() | toList() | repeat()) + cat().all()) | transpose() | intersection().all()\
| insertColumn(*list(nameFiles | op().split("/")[-1].all() | op().split(".")[0].all())[1:]) | display(None)

Spanish                                                                                                                                   
Greek                                                                                                                                     
Irish                                                                                                                                     
Scottish                                                                                                                                  
Portuguese                                                                                                                                
Russian      Li     Han                                                                                                                   
Czech                                                                                                                                     
French                                                                                                                                    
German       Wang                                                                                                                         
Japanese     Ko     Jo     Seo                                                                                                            
Polish                                                                                                                                    
Arabic                                                                                                                                    
English      Yang   Lee    Wang   Chong   Moon   Chung                                                                                    
Chinese      Kang   Yang   Chou   Chi     Chin   Han     Yim     Wang   Sun   You   Woo   Yun   Chu   Chong   Hong   Song   Chang   Koo   
Dutch                                                                                                                                     
Italian                                                                                                                                   
Vietnamese   Han    Kim    Chu    Ho      Ha     Ma      Chung


analyze2Files = intersection() | shape(0) # takes 2 files, and squish them into 1 value
analyze1Combo = ((cat() | toList() | repeat()) + cat().all()) | transpose() | analyze2Files.all() | toSum() # summing all common values
nameFiles | AA_() | analyze1Combo.all() | unsqueeze(1) | withBareNames

Korean       37    
Spanish      104   
Greek        1     
Irish        78    
Scottish     115   
Portuguese   57    
Russian      74    
Czech        41    
French       102   
German       148   
Japanese     9     
Polish       24    
Arabic       5     
English      381   
Chinese      52    
Dutch        58    
Italian      54    
Vietnamese   20


%%time
range(400) | repeatFrom() | apply(lambda x: x+2) | batched(1000) | toSum().all() | ~head(10000) | headOut()

181500
221500
181500
221500
181500
221500
181500
221500
181500
221500
CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.19 s


%%time
range(int(1e6)) | applyS(lambda x: x / 2).all() | ignore()

CPU times: user 72.9 ms, sys: 0 ns, total: 72.9 ms
Wall time: 71.7 ms


%%time
range(int(1e6)) | apply(lambda x: x / 2) | ignore()

CPU times: user 72.3 ms, sys: 0 ns, total: 72.3 ms
Wall time: 71.3 ms


%%time
range(int(1e6)) | apply(applyS(lambda x: x / 2)) | ignore()

CPU times: user 73.4 ms, sys: 0 ns, total: 73.4 ms
Wall time: 72.3 ms

k1lib.bioinfo.cli module¶

Speed analysis¶

For loops with yields vs .all()¶