dimanche 27 mars 2016

python + sqlite failing to insert on violating unique constraint

ok I have a large body of text and I am trying to count all the words in it to get their frequencies. It is too big to keep a dictionary in memory so I am using sqlite for this task. I wrote a Database class and used the word as a primary key because it should be unique.

Somewhere in the process it is failing because the word 'frequency' is trying to be added twice. I have no idea why this could be happenning as there were MANY other common words that must have been counted twice already... Here are my two files....

dbmaker.py

import sqlite3

import sys

class Database(): '''Creates an object with method for adding and checking words against an sqlite database'''

def __init__(self):
    self.dbname = raw_input('What will be the name of this database: ')
    self.table1 = 'words'
    self.column1 = 'word_name'
    self.column1t = 'TEXT'
    self.column2 = 'frequency'
    self.column2t = 'INTEGER'
    self.conn = sqlite3.connect(self.dbname)
    self.c = self.conn.cursor()
    self.c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'.\
        format(tn=self.table1, nf=self.column1, ft=self.column1t))
    self.c.execute('ALTER TABLE {tn} ADD COLUMN {cn} {ct}'.\
        format(tn=self.table1, cn=self.column2, ct=self.column2t))
    self.conn.commit()
    self.checkfile = open('check.txt', 'w+')



def check_word(self, word_name):
    exist = self.c.execute('SELECT * FROM {tn} WHERE {cn}="{wn}"'.\
        format(tn=self.table1, cn=self.column1, wn=word_name))

    exist = self.c.fetchall()
    if exist:
        new_freq = exist[0][1] + 1
        self.c.execute("UPDATE {tn} SET {c2n}={en} WHERE {c1n}='{word}'".\
            format(tn=self.table1, c2n=self.column2, en=new_freq, c1n=self.column1, word=word_name))
        return True
    else:
        return False



def add_word(self, word_name, frequency=1):
    self.checkfile.write('%s\n' % word_name)
    self.c.execute('INSERT INTO {tn} ({c1n}, {c2n}) VALUES ("{wn}", {f})'.\
        format(tn=self.table1, c1n=self.column1, c2n=self.column2, wn=word_name, f=frequency))

crunch.py

import sys
import operator
from dbmaker import Database

db = Database()
infile = open(sys.argv[1])

for line in infile:
    line = line.split()
    for word in line:
        word=word.lower()
        if not db.check_word(word):
            db.add_word(word)

db.conn.commit() db.conn.close()

Here is my traceback:

Traceback (most recent call last):
  File "crunch.py", line 13, in <module>
    db.add_word(word)
  File "/home/ubuntu/wikicorpus/dbmaker.py", line 44, in add_word
    format(tn=self.table1, c1n=self.column1, c2n=self.column2, wn=word_name, f=frequency))

sqlite3.IntegrityError: UNIQUE constraint failed: words.word_name

Aucun commentaire:

Enregistrer un commentaire