ok I have a large body of text and I am trying to count all the words in it to get their frequencies. It is too big to keep a dictionary in memory so I am using sqlite for this task. I wrote a Database class and used the word as a primary key because it should be unique.
Somewhere in the process it is failing because the word 'frequency' is trying to be added twice. I have no idea why this could be happenning as there were MANY other common words that must have been counted twice already... Here are my two files....
dbmaker.py
import sqlite3
import sys
class Database(): '''Creates an object with method for adding and checking words against an sqlite database'''
def __init__(self):
self.dbname = raw_input('What will be the name of this database: ')
self.table1 = 'words'
self.column1 = 'word_name'
self.column1t = 'TEXT'
self.column2 = 'frequency'
self.column2t = 'INTEGER'
self.conn = sqlite3.connect(self.dbname)
self.c = self.conn.cursor()
self.c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'.\
format(tn=self.table1, nf=self.column1, ft=self.column1t))
self.c.execute('ALTER TABLE {tn} ADD COLUMN {cn} {ct}'.\
format(tn=self.table1, cn=self.column2, ct=self.column2t))
self.conn.commit()
self.checkfile = open('check.txt', 'w+')
def check_word(self, word_name):
exist = self.c.execute('SELECT * FROM {tn} WHERE {cn}="{wn}"'.\
format(tn=self.table1, cn=self.column1, wn=word_name))
exist = self.c.fetchall()
if exist:
new_freq = exist[0][1] + 1
self.c.execute("UPDATE {tn} SET {c2n}={en} WHERE {c1n}='{word}'".\
format(tn=self.table1, c2n=self.column2, en=new_freq, c1n=self.column1, word=word_name))
return True
else:
return False
def add_word(self, word_name, frequency=1):
self.checkfile.write('%s\n' % word_name)
self.c.execute('INSERT INTO {tn} ({c1n}, {c2n}) VALUES ("{wn}", {f})'.\
format(tn=self.table1, c1n=self.column1, c2n=self.column2, wn=word_name, f=frequency))
crunch.py
import sys
import operator
from dbmaker import Database
db = Database()
infile = open(sys.argv[1])
for line in infile:
line = line.split()
for word in line:
word=word.lower()
if not db.check_word(word):
db.add_word(word)
db.conn.commit() db.conn.close()
Here is my traceback:
Traceback (most recent call last):
File "crunch.py", line 13, in <module>
db.add_word(word)
File "/home/ubuntu/wikicorpus/dbmaker.py", line 44, in add_word
format(tn=self.table1, c1n=self.column1, c2n=self.column2, wn=word_name, f=frequency))
sqlite3.IntegrityError: UNIQUE constraint failed: words.word_name
Aucun commentaire:
Enregistrer un commentaire