[] |
pandas
requests
. , API .info
(Data Frame) Pandas.#
import pandas as pd
import requests
#
# Fetch the list of articles filed under one wiki category and load it into a
# DataFrame, as a first look at what the Articles/List endpoint returns.
category = 'Ravenclaws'
url = 'http://harrypotter.wikia.com/api/v1/Articles/List'
# Let requests build (and URL-encode) the query string instead of gluing it
# together by hand.
query = {'expand': 1, 'limit': 1000, 'category': category}
# A timeout keeps the script from hanging forever on an unresponsive server.
requested_url = requests.get(url, params=query, timeout=30)
# Surface HTTP errors here rather than as a confusing JSON/KeyError below.
requested_url.raise_for_status()
json_results = requested_url.json()
info = json_results['items']
ravenclaw_df = pd.DataFrame(info)
print('Number of articles: {}'.format(len(info)))
print('')
# Bare expressions: rendered as cell output in a notebook, no effect in a
# plain script.
ravenclaw_df.head()
ravenclaw_df
. , API ID .#
# Build one table (`mydf`) holding the id, title and URL of every article in
# the four house categories, tagged with the house it was listed under.
houses = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
list_endpoint = 'http://harrypotter.wikia.com/api/v1/Articles/List'
# Columns returned by the API that are not used further on in this file.
unused_columns = ['abstract', 'comments', 'ns', 'original_dimensions',
                  'revision', 'thumbnail', 'type']
house_frames = []
for house in houses:
    # The category names are the plural house names, e.g. "Ravenclaws".
    query = {'expand': 1, 'limit': 1000, 'category': house + 's'}
    requested_url = requests.get(list_endpoint, params=query, timeout=30)
    # Surface HTTP errors here rather than as a KeyError on 'items'.
    requested_url.raise_for_status()
    json_results = requested_url.json()
    info = json_results['items']
    house_df = pd.DataFrame(info)
    # Keep only rows whose 'type' is 'article'.
    house_df = house_df[house_df['type'] == 'article']
    # drop()/assign() return new frames, so the filtered slice is never
    # mutated in place.
    house_df = house_df.drop(unused_columns, axis=1).assign(house=house)
    house_frames.append(house_df)
# Concatenate once at the end: appending to a growing frame inside the loop
# re-copies all earlier rows on every iteration.
mydf = pd.concat(house_frames, ignore_index=True)
# Summary of what was collected
print('Number of student articles: {}'.format(len(mydf)))
print('')
print(mydf.head())
print('')
print(mydf.tail())
# " "
# - ,
#
# For every article id in `mydf`, download the article as simple JSON and keep
# the text of its "Personality and traits" section ('' when there is none).
text_dict = {}
section_title = 'Personality and traits'
article_endpoint = 'http://harrypotter.wikia.com/api/v1/Articles/AsSimpleJson'
# One Session for all requests so the HTTP connection is reused across the
# many per-article calls.
with requests.Session() as session:
    for iden in mydf['id']:
        requested_url = session.get(article_endpoint,
                                    params={'id': str(iden)},
                                    timeout=30)
        # Surface HTTP errors here rather than as a KeyError on 'sections'.
        requested_url.raise_for_status()
        json_results = requested_url.json()
        sections = json_results['sections']
        # Content of the first section with the wanted title, or an empty
        # list when the article has no such section.
        paragraphs = next(
            (section['content'] for section in sections
             if section['title'] == section_title),
            [],
        )
        # NOTE(review): presumably some content items (e.g. lists) carry no
        # 'text' key; those are skipped instead of raising KeyError - confirm
        # against the API's response schema.
        all_text = ' '.join(item['text'] for item in paragraphs
                            if 'text' in item)
        text_dict[iden] = all_text
# DataFrame " "
# Turn the {article id: section text} mapping into a two-column frame.
# Building it from explicit columns avoids the from_dict / reset_index /
# rename round-trip.
text_df = pd.DataFrame({'id': list(text_dict.keys()),
                        'text': list(text_dict.values())})
text_df['text_len'] = text_df['text'].str.len()
# Attach the text to the article table, longest texts first
mydf_all = pd.merge(mydf, text_df, on='id')
mydf_all.sort_values('text_len', ascending=False, inplace=True)
# Keep only the articles that actually have a "Personality and traits" text.
# .copy() makes this an independent frame, so adding columns to it later does
# not write into a slice of mydf_all.
mydf_relevant = mydf_all[mydf_all['text_len'] > 0].copy()
print('Number of useable articles: {}'.format(len(mydf_relevant)))
print('')
# Bare expression: rendered as cell output in a notebook, no effect in a
# plain script.
mydf_relevant.head()
# Seed vocabulary: the trait nouns associated with each house.
trait_dict = {
    'Gryffindor': [
        'bravery', 'nerve', 'chivalry', 'daring', 'courage',
    ],
    'Slytherin': [
        'resourcefulness', 'cunning', 'ambition', 'determination',
        'self-preservation', 'fraternity', 'cleverness',
    ],
    'Ravenclaw': [
        'intelligence', 'wit', 'wisdom', 'creativity', 'originality',
        'individuality', 'acceptance',
    ],
    'Hufflepuff': [
        'dedication', 'diligence', 'fairness', 'patience', 'kindness',
        'tolerance', 'persistence', 'loyalty',
    ],
}
, , , , , , .
, ; ; , , , , -- .
(When he was younger, Neville was clumsy, forgetful, shy, and many considered him ill-suited for Gryffindor house because he seemed timid.
With the support of his friends, to whom he was very loyal, the encouragement of Professor Remus Lupin to face his fears in his third year, and the motivation of knowing his parents' torturers were on the loose, Neville became braver, more self-assured, and dedicated to the fight against Lord Voldemort and his Death Eaters.)
synsets
WordNet, , nltk (NLTK Natural Language Toolkit). Synset synonym set, , . synsets
, .
from nltk.corpus import wordnet as wn
#
# Look up the synonym sets for a few example trait words. 'cunning' is looked
# up twice: once across all parts of speech and once restricted to nouns.
foo1 = wn.synsets('bravery')
foo2 = wn.synsets('fairness')
foo3 = wn.synsets('wit')
cunning_any_pos = wn.synsets('cunning')
foo4 = wn.synsets('cunning', pos=wn.NOUN)

# (message template, synsets) pairs in print order; each message is followed
# by a blank line.
lookup_report = [
    ("Synonym sets associated with the word 'bravery': {}", foo1),
    ("Synonym sets associated with the word 'fairness': {}", foo2),
    ("Synonym sets associated with the word 'wit': {}", foo3),
    ("Synonym sets associated with the word 'cunning': {}", cunning_any_pos),
    ("Synonym sets associated with the *noun* 'cunning': {}", foo4),
]
for message, found in lookup_report:
    print(message.format(found))
    print('')

# Print the lemmas (the individual words) contained in each synset
foo_list = [foo1, foo2, foo3, foo4]
for foo in foo_list:
    for synset in foo:
        print((synset.name(), synset.lemma_names()))
wn.synsets('bravery')
: courage.n.01
fearlessness.n.01
. , :crafty.s.01
clever.s.03
(). , cunning , . , wn.synsets('cunning', pos=wn.NOUN)
.synset
. , 'fairness' paleness.n.02
( ) comeliness.n.01
( ). ( ), .# (), "bravery"
# For every lemma of every 'bravery' synset, print its antonyms and its
# derivationally related word forms, one lemma per line with a blank line
# after each.
foo1 = wn.synsets('bravery')
lemma_report = "Synset: {}; Lemma: {}; Antonyms: {}; Word Forms: {}"
for synset in foo1:
    synset_name = synset.name()
    for lemma in synset.lemmas():
        print(lemma_report.format(synset_name,
                                  lemma.name(),
                                  lemma.antonyms(),
                                  lemma.derivationally_related_forms()))
        print("")
# ,
# Hand-picked WordNet synsets per house: only these senses of the trait words
# are used when the trait vocabulary is expanded.
_relevant_synset_names = {
    'Ravenclaw': [
        'intelligence.n.01', 'wit.n.01', 'brain.n.02',
        'wisdom.n.01', 'wisdom.n.02', 'wisdom.n.03',
        'wisdom.n.04', 'creativity.n.01', 'originality.n.01',
        'originality.n.02', 'individuality.n.01', 'credence.n.01',
        'acceptance.n.03',
    ],
    'Hufflepuff': [
        'dedication.n.01', 'commitment.n.04', 'commitment.n.02',
        'diligence.n.01', 'diligence.n.02', 'application.n.06',
        # The original listed 'fairness.n.01' twice; the second entry is
        # taken to be a typo for the second sense of the noun.
        # NOTE(review): confirm 'fairness.n.02' is the intended sense.
        'fairness.n.01', 'fairness.n.02', 'patience.n.01',
        'kindness.n.01', 'forgivingness.n.01', 'kindness.n.03',
        'tolerance.n.03', 'tolerance.n.04', 'doggedness.n.01',
        'loyalty.n.01', 'loyalty.n.02',
    ],
    'Gryffindor': [
        'courage.n.01', 'fearlessness.n.01', 'heart.n.03',
        'boldness.n.02', 'chivalry.n.01', 'boldness.n.01',
    ],
    'Slytherin': [
        'resourcefulness.n.01', 'resource.n.03', 'craft.n.05',
        'cunning.n.02', 'ambition.n.01', 'ambition.n.02',
        'determination.n.02', 'determination.n.04',
        'self-preservation.n.01', 'brotherhood.n.02',
        'inventiveness.n.01', 'brightness.n.02', 'ingenuity.n.02',
    ],
}
relevant_synsets = {}
for _house, _names in _relevant_synset_names.items():
    # wn.synset() resolves a "word.pos.nn" identifier to a Synset object.
    relevant_synsets[_house] = [wn.synset(_name) for _name in _names]
# ,
def _doubles_final_consonant(word):
    """Return True for a three-letter consonant-vowel-consonant word.

    Only such words double their last letter before "-er"/"-est"
    ("big" -> "bigger"); a final w, x or y is never doubled.
    """
    vowels = 'aeiou'
    return (word[0] not in vowels
            and word[1] in vowels
            and word[2] not in vowels + 'wxy')


def _adjective_forms(word):
    """Return [comparative, superlative, "-ness" noun, adverb] for *word*.

    *word* is a lower-case adjective. The forms are produced by simple
    suffix rules, so some results are not real English words.
    """
    if len(word) == 3 and _doubles_final_consonant(word):
        doubled = word + word[-1]
        return [doubled + 'er', doubled + 'est', word + 'ness', word + 'ly']
    if word.endswith(('able', 'ible')):
        return [word + 'r', word + 'st', word + 'ness', word[:-1] + 'y']
    if word.endswith('e'):
        return [word + 'r', word + 'st', word + 'ness', word + 'ly']
    if word.endswith('ic'):
        return [word + 'er', word + 'est', word + 'ness', word + 'ally']
    # Short words keep their "y" ("sly" -> "slyer", "slyness"); longer ones
    # switch to "i" ("happy" -> "happier", "happiness").
    if word.endswith('y') and len(word) > 3:
        stem = word[:-1]
        return [stem + 'ier', stem + 'iest', stem + 'iness', stem + 'ily']
    return [word + 'er', word + 'est', word + 'ness', word + 'ly']


def get_forms(lemma):
    """Return word forms derived from a WordNet lemma.

    For every derivationally related form of *lemma* that is a noun or an
    adjective, the lower-cased form itself is included; for adjectives the
    comparative, superlative, "-ness" noun and adverb are added as well.

    Args:
        lemma: an object exposing ``derivationally_related_forms()``, whose
            items expose ``name()`` and ``synset().pos()`` (an NLTK WordNet
            ``Lemma``).

    Returns:
        A list of lower-case strings, possibly empty and possibly containing
        duplicates.
    """
    output_list = []
    for drf in lemma.derivationally_related_forms():
        # Ask the synset for the part of speech rather than parsing it out
        # of str(drf), which misfires when a name itself contains a ".".
        drf_pos = drf.synset().pos()
        if drf_pos not in ('n', 's', 'a'):
            continue
        # Lower-case once so the suffix tests and the emitted forms agree.
        word = drf.name().lower()
        output_list.append(word)
        if drf_pos in ('s', 'a'):
            # 's' (satellite adjective) and 'a' (adjective): add the
            # inflected and derived forms.
            output_list.extend(_adjective_forms(word))
    return output_list
#
# , , ,
import copy
# Expand each house's seed traits with the lemmas of its hand-picked synsets
# and their derived word forms (new_trait_dict), and collect the antonyms of
# those lemmas plus their derived forms (antonym_dict). trait_dict itself is
# left unmodified.
new_trait_dict = {}
antonym_dict = {}
for house, traits in trait_dict.items():
    # Sets de-duplicate while collecting; they are sorted into lists below.
    trait_words = set(traits)
    anti_words = set()
    for trait in traits:
        for synset in wn.synsets(trait, pos=wn.NOUN):
            # Skip the senses of the word that were not hand-picked as
            # relevant for this house.
            if synset not in relevant_synsets[house]:
                continue
            for lemma in synset.lemmas():
                trait_words.add(lemma.name().lower())
                # One get_forms call per lemma; an empty result adds nothing.
                trait_words.update(get_forms(lemma))
                for ant in lemma.antonyms():
                    anti_words.add(ant.name().lower())
                    anti_words.update(get_forms(ant))
    new_trait_dict[house] = sorted(trait_words)
    antonym_dict[house] = sorted(anti_words)
# Show the result for one house
print("Gryffindor traits: {}".format(new_trait_dict['Gryffindor']))
print("")
print("Gryffindor anti-traits: {}".format(antonym_dict['Gryffindor']))
print("")
# ,
from itertools import combinations
def test_overlap(dict):
results = []
house_combos = combinations(list(dict.keys()), 2)
for combo in house_combos:
results.append(set(dict[combo[0]]).isdisjoint(dict[combo[1]]))
return results
# ; "False"
# test_overlap returns one "is disjoint" flag per pair of houses, so an
# overlap exists as soon as any flag is False. This avoids hard-coding the
# number of pairs (6 for four houses).
print("Any words overlap in trait dictionary? {}".format(not all(test_overlap(new_trait_dict))))
print("Any words overlap in antonym dictionary? {}".format(not all(test_overlap(antonym_dict))))
# "word_tokenize",
from nltk import word_tokenize
# ,
def sort_student(text):
    """Pick the house whose trait words best match *text*.

    The text is tokenized and lower-cased. A house's score is the number of
    tokens found in its ``new_trait_dict`` word list minus the number found
    in its ``antonym_dict`` word list.

    Args:
        text: the description to classify.

    Returns:
        The name of the single highest-scoring house, or ``"Tie!"`` when
        several houses share the highest score (including the case where no
        token matches anything).
    """
    tokens = [word.lower() for word in word_tokenize(text)]
    score_dict = {}
    # Score every house in the trait dictionary instead of repeating the
    # house names here.
    for house, trait_words in new_trait_dict.items():
        # Sets give constant-time membership tests for every token.
        trait_set = set(trait_words)
        anti_set = set(antonym_dict.get(house, ()))
        score_dict[house] = (sum(word in trait_set for word in tokens) -
                             sum(word in anti_set for word in tokens))
    if not score_dict:
        # No houses to choose from: nothing can win.
        return "Tie!"
    best_score = max(score_dict.values())
    winners = [house for house, score in score_dict.items()
               if score == best_score]
    if len(winners) == 1:
        return winners[0]
    return "Tie!"
#
# Two quick sanity checks of the sorter
print(sort_student('Alice was brave'))
print(sort_student('Alice was British'))
# Sort every article with a usable text and compare with the real house.
# The pandas chained-assignment warning is switched off (process-wide)
# because mydf_relevant may be a filtered slice of another frame.
pd.options.mode.chained_assignment = None
mydf_relevant['new_house'] = mydf_relevant['text'].map(sort_student)
# Bare expression: rendered as cell output in a notebook, no effect in a
# plain script.
mydf_relevant.head(20)
# The mean of a boolean column is the fraction of True values; unlike
# sum(...) / len(...) it does not raise on an empty frame.
print("Match rate: {}".format((mydf_relevant['house'] == mydf_relevant['new_house']).mean()))
print("Percentage of ties: {}".format((mydf_relevant['new_house'] == 'Tie!').mean()))
# --
# Tokenize and lower-case the text in the first row of mydf_relevant.
tom_riddle = [
    token.lower()
    for token in word_tokenize(mydf_relevant['text'].values[0])
]
# Per house: which tokens of that text are trait words, and which are
# antonym words (repeats are kept)
houses = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
words_dict = {
    house: [word for word in tom_riddle if word in new_trait_dict[house]]
    for house in houses
}
anti_dict = {
    house: [word for word in tom_riddle if word in antonym_dict[house]]
    for house in houses
}
print(words_dict)
print("")
print(anti_dict)