In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
In [23]:
# Load only the 'c.fa' counter from the "sociology-wos.ind" dataset; the other
# available counters are not needed for building the matrix below.
docs = get_cnt(
    "sociology-wos.ind",
    keys=["c.fa"],
)
Loaded keys: dict_keys(['c.fa'])
Available keys: ['c', 'c.c', 'c.fj', 'c.fy', 'c.fy.j', 'fa', 'c.fa', 'fa.fj', 'fa.fj.fy', 'fa.fy', 'ffa', 'c.ffa', 'ffa.fj', 'ffa.fy', 'fj', 'fj.fy', 'fj.ta', 'fj.ty', 'fy', 'fy.ta', 'fy.ty', 'ta', 'ty', 'ty.ty']
In [25]:
# Number of distinct (c, fa) combinations present in the loaded counter.
len(docs['c.fa'])
Out[25]:
2238622
In [27]:
# Counter key of interest and its dot-separated field names.
key = "c.fa"
keysp = key.split(sep=".")
In [28]:
# Distinct values of the `c` field over all (c, fa) keys, sorted so that the
# row order of the matrix is deterministic.
names = sorted({combo.c for combo in docs['c.fa']})

# Reverse lookup: `c` value -> its position in `names` (the matrix row index).
namesi = {name: pos for pos, name in enumerate(names)}
In [29]:
# Distinct values of the `fa` field over all (c, fa) keys, sorted so that the
# column order of the matrix is deterministic.
# NOTE(review): despite the variable name, these are `fa` values, not years;
# the name is kept because later cells refer to `years` / `yearsi`.
years = sorted({combo.fa for combo in docs['c.fa']})

# Reverse lookup: `fa` value -> its position in `years` (the matrix column index).
yearsi = {value: pos for pos, value in enumerate(years)}
In [30]:
# Number of distinct `fa` values, i.e. the number of matrix columns.
len(years)
Out[30]:
111731
In [31]:
from scipy.sparse import csr_matrix
In [32]:
# Materialise the counter once so that the three parallel lists below are
# guaranteed to share the same ordering.
flatten = list(docs['c.fa'].items())

# COO-style triplets: the stored count, plus the row (by `c`) and column
# (by `fa`) index of each (c, fa) combination.
data = [count for combo, count in flatten]
row_ind = [namesi[combo.c] for combo, count in flatten]
col_ind = [yearsi[combo.fa] for combo, count in flatten]
In [33]:
# Sanity check: one stored value per (c, fa) combination in the counter.
len(data)
Out[33]:
2238622
In [34]:
# Sparse count matrix: rows follow `names` (the `c` values), columns follow
# `years` (the `fa` values). The explicit shape keeps the dimensions tied to
# the label lists rather than to the largest index seen.
spmat = csr_matrix(
    (data, (row_ind, col_ind)),
    shape=(len(names), len(years)),
)
In [35]:
# Persist the axis labels together with the matrix so that row/column indices
# can be mapped back to their `c` / `fa` values after loading.
# NOTE(review): the data was loaded from "sociology-wos.ind" but is saved under
# a "sociology-wos.doc" name - confirm this is the intended variable name.
save_variable(
    "sociology-wos.doc - c.fa - sparse",
    ((names, years), spmat),
)
In [ ]: