In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

database_name = 'sociology-wos-all'
In [2]:
import string_grouper
import editdistance
In [3]:
# the final variable we are constructing
groups = {}

# tracks the next group-id to assign
new_gid = 0
In [7]:
try:
    strings = list(load_variable("%s.c.ysum" % database_name))
except VariableNotFound:
    print("You need to generate ysum before running this notebook.")
    raise
In [8]:
len(strings)
Out[8]:
117200
In [15]:
def isarticle(x):
    # articles look like "author|year|source"; books have no numeric year field
    sp = x.split("|")
    if len(sp) < 2:
        return False
    
    try:
        int(sp[1])
        return True
    except ValueError:
        return False

# drop entries whose title could not be parsed
strings = [x for x in strings if '[no title captured]' not in x]
articles = [x for x in strings if isarticle(x)]
books = [x for x in strings if not isarticle(x)]
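For example, 'Diener, E.|1996|j res pers,v30,p389' has an integer year in its second pipe-delimited field and is classified as an article, while 'Campbell, A.|quality am life perc' has no year field and falls through to the books (both strings appear in the samples below):

isarticle('Diener, E.|1996|j res pers,v30,p389')   # True
isarticle('Campbell, A.|quality am life perc')     # False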
In [16]:
articles[:10]
Out[16]:
['Andrews, F.|1991|measures personality,v1,p61',
 'Cummins, R.|2002|universality subject,p7',
 'Davern, M.|2007|j happiness stud,v8,p429',
 'Deneve, K.|1998|psychol bull,v124,p197',
 'Diener, E.|1996|j res pers,v30,p389',
 'Diener, E.|1996|psychol sci,v7,p181',
 'Diener, E.|2003|annu rev psychol,v54,p403',
 'Diener, E.|1984|psychol bull,v95,p542',
 'Diener, E.|1994|soc indic res,v31,p103',
 'Diener, E.|1999|psychol bull,v125,p276']
In [17]:
books[:10]
Out[17]:
['Andrews, F.|social indicators we',
 'Campbell, A.|quality am life perc',
 'Costa, P.|revised neo personal',
 'Diener, E.|well being fdn hedon',
 'Gurin, G.|am view their mental',
 'Headey, B.|understanding happin',
 'International, W.|pers wellb ind',
 'Thompson, B.|exploratory confirma',
 'Bellah, R.|habits heart individ',
 'Easterlin, R.|nations households e']
In [18]:
print("%s articles, %s books to group" % (len(articles), len(books)))
70834 articles, 45057 books to group

grouping books

In [19]:
# this cell may take quite a while to run.
# on Intel i7-9700F this runs in about a minute on 185k names.

books_grouped = string_grouper.match_strings(
    pd.Series(books), 
    number_of_processes=8, 
    min_similarity=0.7
)
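Under the hood, string_grouper builds TF-IDF vectors over character n-grams and computes sparse cosine similarities between them; min_similarity thresholds that cosine score. A minimal sketch of the same measure with scikit-learn (the 3-gram size and the lack of preprocessing here are simplifying assumptions, not string_grouper's exact settings):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def ngram_similarity(a, b, n=3):
    # TF-IDF over character n-grams, then cosine similarity --
    # roughly the quantity string_grouper computes at scale
    vec = TfidfVectorizer(analyzer='char', ngram_range=(n, n))
    m = vec.fit_transform([a, b])
    return cosine_similarity(m[0], m[1])[0, 0]

ngram_similarity('Popper, K.|conjectures refutat',
                 'Popper, K.|conjectures refutati')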
In [20]:
books_grouped[(books_grouped.similarity<1-1e-8)].sort_values("similarity")
Out[20]:
       left_side                          right_side                         similarity
32939  Shalin, D.|blackwell companion     Hunt, S.|blackwell companion         0.700002
 7523  Hunt, S.|blackwell companion       Shalin, D.|blackwell companion       0.700002
17437  Reskin, B.|sex segregation work    Baron, J.|sex segregation work       0.700004
11952  Baron, J.|sex segregation work     Reskin, B.|sex segregation work      0.700004
21770  Hobsbawm, E.|nations natl since 1  Hobsbawm, E.|nations natl 1780       0.700015
  ...  ...                                ...                                       ...
21980  Lockwood, D.|blackcoated worker s  Lockwood, D.|blackcoated worker      0.993794
21979  Lockwood, D.|blackcoated worker s  Lockwood, D.|black coated worker     0.993794
11796  Lockwood, D.|blackcoated worker    Lockwood, D.|blackcoated worker s    0.993794
 1308  Popper, K.|conjectures refutati    Popper, K.|conjectures refutat       0.994171
35797  Popper, K.|conjectures refutat     Popper, K.|conjectures refutati      0.994171

11070 rows × 3 columns

In [21]:
# for books, we require that the authors are no more than 1 edit from each other
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

ft = defaultdict(set)

for i,r in books_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
    
    # skip the self-matches that match_strings returns
    if ls == rs:
        continue
    
    # compare only the author fields
    la = ls.split("|")[0]
    ra = rs.split("|")[0]
    
    if editdistance.eval(la,ra) > 1:
        continue
    
    ft[ls].add(rs)
    ft[rs].add(ls)
    
print("%s books have some connection to others in a group" % len(ft))
6941 books have some connection to others in a group
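For intuition, editdistance.eval returns the Levenshtein distance between two strings, so the author filter admits small typos and spelling variants but rejects different names that merely share a title. A quick illustration (the first and third pairs come from the matches above; the middle pair is hypothetical):

editdistance.eval('Lockwood, D.', 'Lockwood, D.')  # 0 -- identical authors, kept
editdistance.eval('Diener, E.', 'Diener, F.')      # 1 -- one substitution, kept
editdistance.eval('Reskin, B.', 'Baron, J.')       # > 1, so this pair is rejected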
In [22]:
# assigns group-ids based on the relational structure derived thus far
# the code propagates ids through the network, assuming transitivity of equality

def traverse(x, gid):
    global groups
    groups[x] = gid
    
    neighbors = ft[x]
    for n in neighbors:
        if n not in groups:
            traverse(n, gid)
      
for k in books:
    if k in groups:
        continue
        
    traverse(k, new_gid)
    new_gid += 1
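One caveat: traverse recurses once per newly visited string, so an unusually long equivalence chain could in principle hit Python's default recursion limit (about 1000 frames). A stack-based variant, sketched here as a hypothetical drop-in replacement, sidesteps that:

def traverse_iter(x, gid):
    # same id propagation as traverse, with an explicit stack instead of recursion
    stack = [x]
    while stack:
        cur = stack.pop()
        if cur in groups:
            continue
        groups[cur] = gid
        stack.extend(n for n in ft[cur] if n not in groups)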
In [23]:
len(set(groups.values()))
Out[23]:
41195
In [24]:
Counter(gid for x,gid in groups.items() if len(x.split("|"))==2).most_common(10)
Out[24]:
[(495, 21),
 (396, 19),
 (605, 11),
 (1386, 11),
 (1924, 10),
 (163, 9),
 (2219, 9),
 (4411, 9),
 (1997, 8),
 (672, 7)]
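Group 495 is the largest, with 21 members; its contents can be eyeballed by inverting the mapping:

sorted(k for k, v in groups.items() if v == 495)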

grouping articles

In [25]:
# this cell may take quite a while to run.
# on Intel i7-9700F this runs in five minutes on 234k entries.

articles_grouped = string_grouper.match_strings(
    pd.Series(articles), 
    number_of_processes=8, # decrease this number to 1 or 2 for slower computers or laptops (the fan might start screaming)
    min_similarity=0.8 # the similarity cutoff is tighter for articles than for books
)
In [26]:
articles_grouped[(articles_grouped.similarity<1-1e-8)].sort_values("similarity")
Out[26]:
       left_side                                        right_side                                       similarity
10579  Chirkov, V.|2007|int j intercult rel,v31,p199    Chirkov, V.|2005|int j intercult rel,v29,p469      0.800020
21924  Chirkov, V.|2005|int j intercult rel,v29,p469    Chirkov, V.|2007|int j intercult rel,v31,p199      0.800020
48091  Immergluck, D.|2010|urban aff rev,v46,p3         Immergluck, D.|2005|urban aff rev,v40,p362         0.800080
48088  Immergluck, D.|2005|urban aff rev,v40,p362       Immergluck, D.|2010|urban aff rev,v46,p3           0.800080
18569  Stockdale, A.|2006|j rural stud,v22,p354         Stockdale, A.|2010|j rural stud,v26,p31            0.800103
  ...  ...                                              ...                                                     ...
48051  Vallin, J.|2004|special collection,v2,p11        Vallin, J.|2004|special collection,v2,p1           0.989373
36038  Biderman, A.|1967|ann am acad polit ss,v374,p1   Biderman, A.|1967|ann am acad polit ss,v374,p16    0.989439
56588  Biderman, A.|1967|ann am acad polit ss,v374,p16  Biderman, A.|1967|ann am acad polit ss,v374,p1     0.989439
44933  Portes, A.|1980|soc forces,v59,p200              Portes, A.|1980|soc forces,v59,p201                0.991548
27703  Portes, A.|1980|soc forces,v59,p201              Portes, A.|1980|soc forces,v59,p200                0.991548

4048 rows × 3 columns

In [27]:
# for articles, we require that the entire citation strings are no more than 1 edit apart.
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F

# this cell produces the `ft` variable, which maps each term to its set of equivalent terms, i.e. `ft[A] = {B1,B2,B3}`

ft = defaultdict(set)

for i,r in articles_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
    
    # skip the self-matches that match_strings returns
    if ls == rs:
        continue
    
    # unlike for books, the edit distance is computed over the full citation string
    if editdistance.eval(ls,rs) > 1:
        continue
    
    ft[ls].add(rs)
    ft[rs].add(ls)

print("%s articles have some connection to others in a group" % len(ft))
564 articles have some connection to others in a group
In [28]:
# assigns group-ids based on the relational structure derived thus far
# the code propagates ids through the network, assuming transitivity of equality

def traverse(x, gid):
    global groups
    groups[x] = gid
    
    neighbors = ft[x]
    for n in neighbors:
        if n not in groups:
            traverse(n, gid)

for k in articles:
    if k in groups:
        continue
        
    traverse(k, new_gid)
    new_gid += 1
In [29]:
# this line will break execution if there aren't as many groups assigned as we have articles and books
assert len(articles) + len(books) == len(groups)
In [30]:
len(set(groups.values()))
Out[30]:
111731
In [31]:
len(set(groups.values())) - len(articles)
Out[31]:
40897
In [32]:
len(set(groups.values())) - len(books) - len(articles)
Out[32]:
-4160
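The accounting checks out: 45,057 books plus 70,834 articles gives 115,891 strings, and 115,891 - 111,731 = 4,160, so 4,160 strings were merged into a group containing at least one other string: 3,862 among the books (45,057 - 41,195 groups) and the remaining 298 among the articles.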
In [33]:
len(books)
Out[33]:
45057
In [34]:
len(articles)
Out[34]:
70834
In [35]:
# saving the variable for later
save_variable("%s.groups" % database_name, groups)
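Downstream, groups can be used to collapse citation variants onto one canonical string per group. A minimal sketch, assuming we simply take the shortest member as the representative (canonical_names is a hypothetical helper, not part of knowknow):

def canonical_names(groups):
    # choose one representative string per group id (here: the shortest member)
    rep = {}
    for name, gid in groups.items():
        if gid not in rep or len(name) < len(rep[gid]):
            rep[gid] = name
    # map every string to its group's representative
    return {name: rep[gid] for name, gid in groups.items()}

canon = canonical_names(groups)
canon['Lockwood, D.|blackcoated worker s']  # -> the group's shortest variant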

sanity checks

In [ ]:
g = load_variable("%s.groups" % database_name)
In [ ]:
len(g)
In [ ]:
Counter([g[x] for x in groups if len(x.split("|"))>2]).most_common(10)
In [ ]:
len(set(g.values())) - len(g)
In [ ]:
len(set(g.values()))
In [ ]:
cits = get_cnt('sociology-wos.ind', ['c','c.fy'])
In [ ]:
sum(cits['c.fy'].values())
In [ ]:
list(cits['c.fy'].items())[:5]
In [ ]:
list(g)[:5]
In [ ]:
g['bourdieu|distinction social c']
In [ ]:
to_print = sorted( cits['c'].items(), key=lambda x:-x[1] )[:20]
to_print = [x[0] for x in to_print]
In [ ]:
[x for x in g if 'bourd' in x]
In [ ]:
list(cits['c.fy'])[:5]
In [ ]:
cits['c'][('bourdieu|logic practice',)]
In [ ]:
to_print
In [ ]:
# print the members (with citation counts) of the first five groups that have any citations
printed_i = 0
checking_j = 0

while printed_i < 5:
    mine = [k for k in g if g[k] == checking_j]
    myvals = [cits['c'][(k,)] for k in mine]
    if sum(myvals) > 0:
        for k, v in zip(mine, myvals):
            print("%s (%s)" % (k, v))
        printed_i += 1
    
    checking_j += 1