In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

User Settings

In [2]:
# Base name of the counter database; the loading cells below derive
# "<database_name>.doc" and "<database_name>.ind" from it.
database_name = "sociology-jstor"

# Counter component whose values are being removed. Every counter whose
# dotted name contains this component is pruned further down.
remove_type = 'fj'

# Values of `remove_type` whose entries should be dropped from the counters.
to_remove = [
    'Canadian Journal of Sociology / Cahiers canadiens de sociologie',
    'Contagion',
    'Contagion: Journal of Violence, Mimesis, and Culture',
    'Contexts',
    'Journal of Applied Social Science',
    'Max Weber Studies',
    'Race, Poverty & the Environment',
    'Social Thought & Research',
    # NOTE(review): the embedded newline and run of spaces are kept verbatim;
    # presumably this is how the name appears in the raw data -- confirm.
    'The Canadian Journal of Sociology / Cahiers canadiens de\n                sociologie',
]

Consolidating

In [3]:
# Load the "<database_name>.doc" counter set, requesting every key that
# get_cnt_keys reports for it.
# NOTE(review): ".doc" presumably holds per-document counts -- confirm in knowknow.
cnt_name = "%s.doc" % database_name
cnt_doc = get_cnt(
    cnt_name,
    keys=get_cnt_keys(cnt_name),
)
Loaded keys: dict_keys(['fy', 'c', 'fj', 'fj.fy', 'c.fj', 'c.fy', 'a.c', 'a.fj.fy', 'a', 'c.c', 'c.t', 't', 'fy.t', 'fj.t'])
Available keys: ['fy', 'c', 'fj', 'fj.fy', 'c.fj', 'c.fy', 'a.c', 'a.fj.fy', 'a', 'c.c', 'c.t', 't', 'fy.t', 'fj.t']
In [4]:
# Load the "<database_name>.ind" counter set, requesting every key that
# get_cnt_keys reports for it. `cnt_name` is reassigned here, so from this
# cell on it refers to the ".ind" name.
# NOTE(review): ".ind" presumably holds per-citation counts -- confirm in knowknow.
cnt_name = "%s.ind" % database_name
cnt_ind = get_cnt(
    cnt_name,
    keys=get_cnt_keys(cnt_name),
)
Loaded keys: dict_keys(['fy', 'c', 'fj', 'fj.fy', 'c.fj', 'c.fy', 'a.c', 'a.fj.fy', 'a', 'c.c', 'c.t', 't', 'fy.t', 'fj.t'])
Available keys: ['fy', 'c', 'fj', 'fj.fy', 'c.fj', 'c.fy', 'a.c', 'a.fj.fy', 'a', 'c.c', 'c.t', 't', 'fy.t', 'fj.t']
In [5]:
# Summarise what is about to be removed: how many values, of which counter
# component, plus the first two values as a sanity check.
print(
    "Consolidating %s of type `%s`." % (len(to_remove), remove_type),
    "Here are some examples: %s." % (list(to_remove)[:2],),
    sep="\n",
)
Consolidating 9 of type `fj`.
Here are some examples: ['Canadian Journal of Sociology / Cahiers canadiens de sociologie', 'Contagion'].
In [6]:
# Select every counter whose dotted name has `remove_type` as one of its
# components (e.g. 'c.fj' splits into ['c', 'fj']). `cnt_name` is whatever was
# assigned most recently -- the ".ind" name when the cells run top to bottom.
to_prune = list(
    filter(
        lambda counter_name: remove_type in counter_name.split("."),
        get_cnt_keys(cnt_name),
    )
)
to_prune
Out[6]:
['fj', 'fj.fy', 'c.fj', 'a.fj.fy', 'fj.t']
In [7]:
# Remove every entry that refers to one of the `to_remove` values from each
# counter listed in `to_prune`, in both the doc and the ind counter sets.
#
# The cell is safe to re-run: keys that are already gone (or were never
# present) are skipped instead of raising KeyError.

# A set gives constant-time membership tests while scanning large counters
# and collapses any duplicate names in `to_remove`.
removal_set = set(to_remove)

for tp in to_prune:

    # position of `remove_type` within the dotted counter name (e.g. 1 for 'c.fj')
    whichT = tp.split(".").index( remove_type )

    print("pruning '%s'..." % tp)
    print("old size:", len(cnt_doc[tp]))

    # Prune each counter set from its own keys, so a key present in only one
    # of the two is still removed and never triggers a KeyError in the other.
    for counts in (cnt_doc[tp], cnt_ind[tp]):
        if tp == remove_type:
            # single-component counter: entries are keyed by the bare value
            tydels = [x for x in removal_set if x in counts]
        else:
            # compound counter: keys are indexable, one slot per component
            tydels = [x for x in counts if x[ whichT ] in removal_set]

        # keys were collected up front, so deleting here does not mutate the
        # mapping while it is being iterated
        for tr in tydels:
            del counts[tr]

    print("new size:", len(cnt_doc[tp]))
pruning 'fj'...
old size: 50
new size: 41
pruning 'fj.fy'...
old size: 1980
new size: 1888
pruning 'c.fj'...
old size: 790187
new size: 778966
pruning 'a.fj.fy'...
old size: 69032
new size: 68019
pruning 'fj.t'...
old size: 104876
new size: 103871
In [ ]: