import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
showdocs("counter")
First, you need to get some data. In accordance with JSTOR's usage policies, I do not provide any full-text data, and full-text data is exactly what this notebook needs. You can obtain your own by requesting full OCR data packages through JSTOR's Data for Research initiative.
Make sure to read carefully through "User Settings," choose the appropriate values, and run the entire notebook.
This will create a new "database" of counts, which can be recalled by running my_counts = get_cnt( '<DB_NAME_HERE>' ).

database_name is the name you choose for the final dataset of counts.

zipdir is the directory which contains the .zip files JSTOR provides to you (not included).

mode chooses between "basic" and "all" mode. "basic" mode doesn't count everything, but it does reduce RAM overhead. It produces:

- c counts, the number of citations each document receives
- c.fj counts, the number of citations each document receives from each journal's articles
- c.fy counts, the number of citations each document receives from each year's articles
- fj counts, the number of citations from each journal
- fj.fy counts, the number of citations in each journal in each year
- t and fy.t counts, for term time series and filtering

"all" mode includes everything in basic mode, plus:

- c.t and fj.t counts (citation-term and journal-term counts)
- a, a.c, and a.j.y counts (author-related counts; these appear as fa, c.fa, and fa.fj.fy in the counting code below)
- c.c, the cooccurrence network between citations

database_name = 'sociology-jstor-basicall'
zipdir = 'G:/My Drive/projects/qualitative analysis of literature/pre 5-12-2020/003 process JSTOR output/RaW dAtA/'
mode = 'all'
I use citation and journal filters while counting. This filtering is important when working with large datasets. You can run the "trend summaries/cysum" notebook on a basic database, and use the variable it automatically generates, "<DBNAME>.included_citations", to control which citations are used when computing the all database.
In most cases, it's best to set use_included_citations_filter and use_included_journals_filter both to False the first time you run this notebook on a new dataset.
use_included_citations_filter = True
use_included_journals_filter = True
# not necessary if you're not filtering based on citations and journals pre-count
included_citations = load_variable("sociology-jstor.included_citations")
included_journals = ['Acta Sociologica', 'Administrative Science Quarterly', 'American Journal of Political Science', 'American Journal of Sociology', 'American Sociological Review', 'Annual Review of Sociology', 'BMS: Bulletin of Sociological Methodology / Bulletin de Méthodologie Sociologique', 'Berkeley Journal of Sociology', 'Contemporary Sociology', 'European Sociological Review', 'Hitotsubashi Journal of Social Studies', 'Humboldt Journal of Social Relations', 'International Journal of Sociology', 'International Journal of Sociology of the Family', 'International Review of Modern Sociology', 'Journal for the Scientific Study of Religion', 'Journal of Health and Social Behavior', 'Journal of Marriage and Family', 'Language in Society', 'Michigan Sociological Review', 'Polish Sociological Review', 'Review of Religious Research', 'Social Forces', 'Social Indicators Research', 'Social Problems', 'Social Psychology Quarterly', 'Sociological Bulletin', 'Sociological Focus', 'Sociological Forum', 'Sociological Methodology', 'Sociological Perspectives', 'Sociological Theory', 'Sociology', 'Sociology of Education', 'Sociology of Religion', 'Symbolic Interaction', 'The American Sociologist', 'The British Journal of Sociology', 'The Canadian Journal of Sociology', 'The Sociological Quarterly', 'Theory and Society']
Terms are iteratively pruned. After CONSOLIDATE_EVERY_N_CITS
citations are counted, the algorithm will keep only the top NUM_TERMS_TO_KEEP
terms, blacklisting the rest and not counting them anymore. This doesn't hurt the dataset, but dramatically reduces the RAM overhead and the size of the final dataset on disk.
CONSOLIDATE_TERMS = True
NUM_TERMS_TO_KEEP = 5000
CONSOLIDATE_EVERY_N_CITS = NUM_TERMS_TO_KEEP*3
#CONSOLIDATE_EVERY_N_CITS = 1000
NPERYEAR = 300
It's also convenient to be able to rename various entities. For example, there were a few different names for the Canadian Journal of Sociology. If you want to rename something other than journals, you'll have to modify the code to add that feature.
journal_map = {} # default
journal_map = {
"Canadian Journal of Sociology / Cahiers canadiens de sociologie": 'The Canadian Journal of Sociology',
"The Canadian Journal of Sociology / Cahiers canadiens de\n sociologie": 'The Canadian Journal of Sociology',
'The Canadian Journal of Sociology / Cahiers canadiens de sociologie': 'The Canadian Journal of Sociology'
}
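The map is applied to each document's metadata in the main loop further down. In miniature, with one of the raw names above:

raw_journal = "Canadian Journal of Sociology / Cahiers canadiens de sociologie"
if raw_journal in journal_map:
    raw_journal = journal_map[raw_journal]
print(raw_journal)   # 'The Canadian Journal of Sociology'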
# utilities
from nltk import sent_tokenize
from zipfile import ZipFile
import os
import sys
sys.path.insert(0, os.path.abspath('./creating variables/'))
# library functions for cleaning and extracting in-text citations from OCR
from cnt_cooc_jstor_lib import (
citation_iterator, getOuterParens,
Document, ParseError,
clean_metadata
)
# XML parser
from lxml.etree import _ElementTree as ElementTree
from lxml import etree
recovering_parser = etree.XMLParser(recover=True)
# getting ready for term counting
from nltk.corpus import stopwords as sw
stopwords = set(sw.words('english'))
zipfiles = list(Path(zipdir).glob("*.zip"))
The following helper function file_iterator iterates through all documents inside a list of zipfiles. Each iteration yields three items: the document identifier (recovered from the file name), the raw metadata XML as bytes, and the decoded OCR text.
def getname(x):
x = x.split("/")[-1]
x = re.sub(r'(\.xml|\.txt)','',x)
return x
def file_iterator(zipfiles):
from random import shuffle
all_files = []
for zf in zipfiles:
archive = ZipFile(zf, 'r')
files = archive.namelist()
names = list(set(getname(x) for x in files))
all_files += [(archive,name) for name in names]
shuffle(all_files)
for archive, name in all_files:
try:
yield(
name.split("-")[-1].replace("_", "/"),
archive.read("metadata/%s.xml" % name),
archive.read("ocr/%s.txt" % name).decode('utf8')
)
except KeyError: # some very few articles don't have both
continue
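As a quick sanity check (assuming zipdir contains at least one JSTOR .zip package), you can pull a single item from the iterator; the peek_* names are just for illustration:

# peek at one document without running the full counting loop
peek_doi, peek_meta, peek_ocr = next(file_iterator(zipfiles))
print(peek_doi)          # the identifier recovered from the file name
print(len(peek_meta))    # size of the raw metadata XML, in bytes
print(peek_ocr[:200])    # first 200 characters of the OCR text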
get_content_string
takes the string contents of an XML file produced by JSTOR, which represents the OCR text of a given article. The function pulls the text out of each <page> (and <sec>) element, cleans it of OCR peculiarities, and joins the pages back into a single content string for further processing.
def basic_ocr_cleaning(x):
# remove multiple spaces in a row
x = re.sub(r" +", ' ', str(x))
# remove hyphenations [NOTE this should be updated, with respect to header and footer across pages...]
x = re.sub(r"([A-Za-z]+)-\s+([A-Za-z]+)", "\g<1>\g<2>", x)
x = x.strip()
return x
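For instance, the space collapsing and de-hyphenation behave like this:

print(basic_ocr_cleaning("a socio-  logical  argument"))   # 'a sociological argument'
print(basic_ocr_cleaning("  the  social   self  "))        # 'the social self'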
def get_content_string(ocr_string):
docXml = etree.fromstring(ocr_string, parser=recovering_parser)
pages = docXml.findall(".//page")
page_strings = []
for p in pages:
if p.text is None:
continue
page_strings.append(p.text)
secs = docXml.findall(".//sec")
for s in secs:
if s.text is None:
continue
if s.text.strip() == '':
try_another = etree.tostring(s, encoding='utf8', method='text').decode("utf8").strip()
#print(try_another)
if try_another == '':
continue
page_strings.append(try_another)
else:
page_strings.append(s.text.strip())
return basic_ocr_cleaning( "\n\n".join(page_strings) )
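A toy call on a hand-written stand-in for a JSTOR OCR file (not real data) shows what the function returns:

fake_ocr = "<article><page>THE SOCIAL  SELF</page><page>an argu- ment about norms</page></article>"
print(get_content_string(fake_ocr))
# THE SOCIAL SELF
#
# an argument about norms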
The consolidate_terms function below was built to eliminate all terms which are not in the top NUM_TERMS_TO_KEEP. In the branch enabled below, the overall term (t) counts are sorted in descending order and the top NUM_TERMS_TO_KEEP terms are preserved, split evenly between two-word and one-word terms; the rest are blacklisted and never counted again. An alternative, disabled branch sorts the fromyear-term (fy.t) counts instead, so the top entries are the term-year pairs which accumulated the most appearances in citation contexts, and each year keeps up to NPERYEAR of its most frequent terms.
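A miniature version of the selection step, on throwaway values rather than the real counters (the real code keeps NUM_TERMS_TO_KEEP//2 per bucket, here it's one per bucket; np comes from the knowknow import, as in the function below):

toy_terms  = [('social',), ('role-theory',), ('social-theory',), ('self',)]
toy_counts = np.array([40, 25, 10, 5])
order = list(reversed(np.argsort(toy_counts)))
keep  = [toy_terms[i] for i in order if '-' in toy_terms[i][0]][:1]      # top two-word terms
keep += [toy_terms[i] for i in order if '-' not in toy_terms[i][0]][:1]  # top one-word terms
print(keep)   # [('role-theory',), ('social',)]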
term_whitelist = set()
def consolidate_terms():
global term_whitelist, CONSOLIDATION_CUTOFF
have_now = set(cnt_doc['t'])
# this is where the filtering occurs
to_keep = set()
if True:
# takes terms based on the maximum number I can take...
terms = list(cnt_doc['t'].keys())
counts = np.array([cnt_doc['t'][k] for k in terms])
argst = list(reversed(np.argsort(counts)))
to_keep = [terms[i] for i in argst if '-' in terms[i][0]][:NUM_TERMS_TO_KEEP//2] # half should be 2-tuples
to_keep += [terms[i] for i in argst if not '-' in terms[i][0]][:NUM_TERMS_TO_KEEP//2] # half should be 1-tuples
to_remove = have_now.difference(to_keep)
to_remove = set("-".join(x) for x in to_remove)
if False:
# takes the top 5000 terms in terms of yearly count
sort_them = sorted(cnt_doc['fy.t'], key=lambda x: -cnt_doc['fy.t'][x])
to_keep = defaultdict(set)
i = 0
while not len(to_keep) or (
min(len(x) for x in to_keep.values()) < NPERYEAR and
i < len(sort_them)
):
# adds the term to the year set, if it's not already "full"
me = sort_them[i]
me_fy, me_t = me
# eventually, we don't count it :P
if cnt_doc['t'][me_t] < CONSOLIDATION_CUTOFF:
break
if len(to_keep[me_fy]) < NPERYEAR:
to_keep[me_fy].add(me_t)
i += 1
if False: # useful for debugging
print({
k: len(v)
for k,v in to_keep.items()
})
to_keep = set(chain.from_iterable(x for x in to_keep.values()))
to_remove = have_now.difference(to_keep)
# so that we never log counts for these again:
term_whitelist.update([x[0] for x in to_keep])
# the rest of the code is pruning all other term counts for this term in memory
print("consolidating... removing", len(to_remove), 'e.g.', sample(to_remove,5))
to_prune = ['t','fy.t','fj.t','c.t']
for tp in to_prune:
whichT = tp.split(".").index('t') # this checks where 't' is in the name of the variable (first or second?)
print("pruning '%s'..." % tp)
tydels = [x for x in cnt_doc[tp] if x[ whichT ] in to_remove]
print("old size:", len(cnt_doc[tp]))
for tr in tydels:
del cnt_doc[tp][tr]
del cnt_ind[tp][tr]
print("new size:", len(cnt_doc[tp]))
print("final terms: ", ", ".join( sample(list("-".join(list(x)) for x in cnt_doc['t']), 200) ))
The following cells contain the counting function, which accounts for a document in various ways. This function should be relatively simple to extend, if you want to count other combinations, or different attributes altogether.
cnt_ind = defaultdict(lambda:defaultdict(int))
track_doc = defaultdict(lambda:defaultdict(set))
cnt_doc = defaultdict(lambda:defaultdict(int))
def cnt(term, space, doc):
# it's a set, yo
track_doc[space][term].add(doc)
# update cnt_doc
cnt_doc[space][term] = len(track_doc[space][term])
# update ind count
cnt_ind[space][term] += 1
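The difference between the two counters: cnt_doc records how many distinct documents a key was seen in, while cnt_ind records every individual occurrence. A throwaway illustration with made-up keys (re-run the cell above afterwards so these toy entries don't leak into the real counts):

cnt("durkheim 1912", 'c', 'doi-A')    # first mention in document A
cnt("durkheim 1912", 'c', 'doi-A')    # a second mention in the same document
cnt("durkheim 1912", 'c', 'doi-B')    # one mention in document B
print(cnt_ind['c']["durkheim 1912"])  # 3: every individual occurrence
print(cnt_doc['c']["durkheim 1912"])  # 2: distinct citing documents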
cits = 0
last_print = 0
citations_skipped = 0
def account_for(doc):
global cits, last_print, mode, citations_skipped
# consolidating "terms" counter as I go, to limit RAM overhead
    # I'm only interested in the most common NUM_TERMS_TO_KEEP terms
if CONSOLIDATE_TERMS and \
not len(term_whitelist) and \
cits - last_print > CONSOLIDATE_EVERY_N_CITS:
print("Citation %s" % cits)
print("Term %s" % len(cnt_doc['t']))
#print(sample(list(cnt_doc['t']), 10))
last_print = cits
consolidate_terms()
if 'citations' not in doc or not len(doc['citations']):
#print("No citations", doc['doi'])
return
for c in doc['citations']:
if 'contextPure' not in c:
raise Exception("no contextPure...")
for cited in c['citations']:
if use_included_citations_filter and (cited not in included_citations):
citations_skipped += 1
continue
cits += 1
cnt(doc['year'], 'fy', doc['doi'])
# citation
cnt(cited, 'c', doc['doi'])
# journal
cnt(doc['journal'], 'fj', doc['doi'])
# journal year
cnt((doc['journal'], doc['year']), 'fj.fy', doc['doi'])
# citation journal
cnt((cited, doc['journal']), 'c.fj', doc['doi'])
# citation year
cnt((cited, doc['year']), 'c.fy', doc['doi'])
# constructing the tuples set :)
sp = c['contextPure'].lower()
sp = re.sub("[^a-zA-Z\s]+", "", sp) # removing extraneous characters
sp = re.sub("\s+", " ", sp) # removing extra characters
sp = sp.strip()
sp = sp.split() # splitting into words
sp = [x for x in sp if x not in stopwords] # strip stopwords
if False:
tups = set(zip(sp[:-1], sp[1:])) # two-word tuples
elif False:
tups = set( (t1,t2) for t1 in sp for t2 in sp if t1!=t2 )# every two-word pair :)
else:
tups = set( "-".join(sorted(x)) for x in set(zip(sp[:-1], sp[1:]))) # two-word tuples
tups.update( sp ) # one-word tuples
#print(len(tups),c['contextPure'], "---", tups)
if len(term_whitelist):
tups = [x for x in tups if x in term_whitelist]
# just term count, in case we are using the `basic` mode
for t1 in tups:
# term
cnt((t1,), 't', doc['doi'])
# term year
cnt((doc['year'], t1), 'fy.t', doc['doi'])
if mode == 'all':
for cited in c['citations']:
if use_included_citations_filter and (cited not in included_citations):
continue
# term features
for t1 in tups:
# cited work, tuple
cnt((cited, t1), 'c.t', doc['doi'])
# term journal
cnt((doc['journal'], t1), 'fj.t', doc['doi'])
if False: # eliminating data I'm not using
# author loop
for a in doc['authors']:
# term author
cnt((a, t1), 'fa.t', doc['doi'])
if len(term_whitelist): # really don't want to do this too early. wait until it's narrowed down to the 5k
# term term...
for t2 in tups:
# if they intersect each other, continue...
if len(set(t1).intersection(set(t2))) >= min(len(t1),len(t2)):
continue
# term term
cnt((t1,t2), 't.t', doc['doi'])
# author loop
for a in doc['authors']:
# citation author
cnt((cited,a), 'c.fa', doc['doi'])
# year author journal
cnt((a, doc['journal'], doc['year']), 'fa.fj.fy', doc['doi'])
# author
cnt((a,), 'fa', doc['doi'])
# add to counters for citation-citation counts
for cited1 in c['citations']:
for cited2 in c['citations']:
if cited1 >= cited2:
continue
cnt(( cited1, cited2 ), 'c.c', doc['doi'])
cnt(( cited1, cited2, doc['year'] ), 'c.c.fy', doc['doi'])
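To see what account_for expects, here is a minimal fabricated document record; the field names mirror the ones read above, the values are invented, and the counts will only register if the included-citations filter is off (or the toy key happens to be in included_citations). As with the previous toy example, reset the counters before a real run:

toy_doc = {
    'doi': '10.0000/fake',
    'year': 1999,
    'journal': 'American Sociological Review',
    'authors': ['someone'],
    'citations': [{
        'contextPure': 'this builds on earlier work on the social self',
        'citations': ['mead 1934']   # a made-up key; real keys come from citation_iterator
    }]
}
account_for(toy_doc)
print(cnt_doc['c']['mead 1934'])                                     # 1 if the citation filter is off
print(cnt_doc['fj.fy'][('American Sociological Review', 1999)])      # likewise 1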
This cell is long-running: it iterates over every document in the zip packages, extracts their in-text citations, and counts them.
seen = set()
skipped = 0
total_count = Counter()
doc_count = Counter()
pair_count = Counter()
debug = False
for i, (doi, metadata_str, ocr_str) in enumerate( file_iterator(zipfiles) ):
if i % 1000 == 0:
print("Document", i, "...",
len(cnt_doc['fj'].keys()), "journals...",
len(cnt_doc['c'].keys()), "cited works...",
len(cnt_doc['fa'].keys()), "authors...",
len(cnt_doc['t'].keys()), "terms used...",
citations_skipped, "skipped citations...",
cnt_doc['t'][('social',)], "'social' terms"
)
try:
drep = clean_metadata( doi, metadata_str )
# sometimes multiple journal names map onto the same journal, for all intents and purposes
if drep['journal'] in journal_map:
drep['journal'] = journal_map[drep['journal']]
# only include journals in the list "included_journals"
if use_included_journals_filter and (drep['journal'] not in included_journals):
continue
if debug: print("got meta")
if drep['type'] != 'research-article':
continue
# some types of titles should be immediately ignored
def title_looks_researchy(lt):
lt = lt.lower()
lt = lt.strip()
for x in ["book review", 'review essay', 'back matter', 'front matter', 'notes for contributors', 'publication received', 'errata:', 'erratum:']:
if x in lt:
return False
for x in ["commentary and debate", 'erratum', '']:
if x == lt:
return False
return True
lt = drep['title'].lower()
if not title_looks_researchy(lt):
continue
# Don't process the document if there are no authors
if not len(drep['authors']):
continue
drep['content'] = get_content_string(ocr_str)
drep['citations'] = []
# loop through the matching parentheses in the document
for index, (parenStart, parenContents) in enumerate(getOuterParens(drep['content'])):
citations = list(citation_iterator(parenContents))
if not len(citations):
continue
citation = {
"citations": citations,
"contextLeft": drep['content'][parenStart-400+1:parenStart+1],
"contextRight": drep['content'][parenStart + len(parenContents) + 1:parenStart + len(parenContents) + 1 + 100],
"where": parenStart
}
# cut off any stuff before the first space
first_break_left = re.search(r"[\s\.!\?]+", citation['contextLeft'])
if first_break_left is not None:
clean_start_left = citation['contextLeft'][first_break_left.end():]
else:
clean_start_left = citation['contextLeft']
# cut off any stuff after the last space
last_break_right = list(re.finditer(r"[\s\.!\?]+", citation['contextRight']))
if len(last_break_right):
clean_end_right = citation['contextRight'][:last_break_right[-1].start()]
else:
clean_end_right = citation['contextRight']
# we don't want anything more than a sentence
sentence_left = sent_tokenize(clean_start_left)
if len(sentence_left):
sentence_left = sentence_left[-1]
else:
sentence_left = ""
            sentence_right = sent_tokenize(clean_end_right)
            if len(sentence_right):
                sentence_right = sentence_right[0]
            else:
                sentence_right = ""
# finally, strip the parentheses from the string
sentence_left = sentence_left[:-1]
sentence_right = sentence_right[1:]
# add the thing in context
full = sentence_left + "<CITATION>" + sentence_right
citation['contextPure'] = sentence_left
#print(full)
drep['citations'].append(citation)
# now that we have all the information we need,
# we simply need to "count" this document in a few different ways
account_for(drep)
except ParseError as e:
print("parse error...", e.args, doi)
list(cnt_doc['t'])[:5]  # peek at a few surviving term keys
len([x for x in cnt_doc['t'] if not '-' in x[0]])  # how many one-word terms survived
min(list(cnt_doc['t'].values()))  # the smallest document count among kept terms
for k,v in cnt_doc.items():
print(k, len(v))
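Before saving, it can be worth glancing at the most-cited works the run produced, for example:

# the ten most-cited works, by number of distinct citing documents
sorted(cnt_doc['c'].items(), key=lambda kv: -kv[1])[:10]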
save_cnt("%s.doc"%database_name, cnt_doc)
save_cnt("%s.ind"%database_name, cnt_ind)
comments()