# -*- coding: utf-8 -*-
import sys
import nltk
import json
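
# NOTE: sent_tokenize and pos_tag depend on NLTK data models (for example,
# the 'punkt' sentence tokenizer and a part-of-speech tagger); if NLTK
# reports that a resource is missing, fetch it with nltk.download().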
# Load in output from blogs_and_nlp__get_feed.py
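#
# Example invocation (script and file names are illustrative):
#
#   $ python extract_entities.py blog_data.json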
BLOG_DATA = sys.argv[1]
blog_data = json.loads(open(BLOG_DATA).read())
for post in blog_data:
    sentences = nltk.tokenize.sent_tokenize(post['content'])
    tokens = [nltk.tokenize.word_tokenize(s) for s in sentences]
    pos_tagged_tokens = [nltk.pos_tag(t) for t in tokens]

    # Flatten the list since we're not using sentence structure
    # and sentences are guaranteed to be separated by a special
    # POS tuple such as ('.', '.')

    pos_tagged_tokens = [token for sent in pos_tagged_tokens for token in sent]
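
    # Group runs of consecutive tokens whose POS tags start with 'NN'
    # (NN, NNS, NNP, NNPS) into candidate entity chunks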
    all_entity_chunks = []
    previous_pos = None
    current_entity_chunk = []
    for (token, pos) in pos_tagged_tokens:

        if pos == previous_pos and pos.startswith('NN'):
            current_entity_chunk.append(token)
        elif pos.startswith('NN'):
            if current_entity_chunk != []:

                # Note that current_entity_chunk could be a duplicate when appended,
                # so frequency analysis again becomes a consideration

                all_entity_chunks.append((' '.join(current_entity_chunk), pos))
            current_entity_chunk = [token]

        previous_pos = pos

    # Flush any chunk that is still in progress once the tokens run out
    if current_entity_chunk != []:
        all_entity_chunks.append((' '.join(current_entity_chunk), previous_pos))

    # Store the chunks as an index for the document
    # and account for frequency while we're at it...

    post['entities'] = {}
    for c in all_entity_chunks:
        post['entities'][c] = post['entities'].get(c, 0) + 1
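
    # post['entities'] now maps each (entity, POS) tuple to the number of
    # times that chunk appeared in the post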

    # For example, we could display just the title-cased entities

    print post['title']
    print '-' * len(post['title'])
    for (entity, pos) in post['entities']:
        if entity.istitle():
            print '\t%s (%s)' % (entity, post['entities'][(entity, pos)])
    print