In [1]:
%%bash
wget -qN http://www.lexique.org/listes/liste_mots.txt
wget -qN https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt

(cut -f1 liste_mots.txt | grep "[a-z]" | grep -v "[_ ]" | awk '{print "1",$0}' | shuf | head -10000;
cat words_alpha.txt | grep "[a-z]" | grep -v "[_ ]" | awk '{print "0",$0}' | shuf | head -10000) \
    | iconv -f utf8 -t ascii//TRANSLIT | shuf > words.txt
head words.txt

1 versos
0 leashing
0 nebulously
0 horselaughs
0 adjiger
1 recolla
1 saisissez
0 dovens
0 dorsocervically
0 ailurophobe


In [2]:
words = []
labels = []
with open('words.txt') as fp:
    for line in fp:
        label, word = line.strip().split()
        words.append(word)
        labels.append(int(label))

print(words[:10])
print(labels[:10])

['versos', 'leashing', 'nebulously', 'horselaughs', 'adjiger', 'recolla', 'saisissez', 'dovens', 'dorsocervically', 'ailurophobe']
[1, 0, 0, 0, 0, 1, 1, 0, 0, 0]


In [3]:
import collections
vocab = collections.defaultdict(lambda: len(vocab))

features = []
for word in words:
    text_features = list(word)
    features.append([vocab[ngram] for ngram in text_features])

print(features[:10])
print(len(vocab))

[[0, 1, 2, 3, 4, 3], [5, 1, 6, 3, 7, 8, 9, 10], [9, 1, 11, 12, 5, 4, 12, 3, 5, 13], [7, 4, 2, 3, 1, 5, 6, 12, 10, 7, 3], [6, 14, 15, 8, 10, 1, 2], [2, 1, 16, 4, 5, 5, 6], [3, 6, 8, 3, 8, 3, 3, 1, 17], [14, 4, 0, 1, 9, 3], [14, 4, 2, 3, 4, 16, 1, 2, 0, 8, 16, 6, 5, 5, 13], [6, 8, 5, 12, 2, 4, 18, 7, 4, 11, 1]]
27


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

X = torch.zeros(len(words), len(vocab))

for i, x in enumerate(features):
    for feature in x:
        X[i, feature] += 1

import random
random.shuffle(labels)
Y = torch.LongTensor(labels)
print(words[33], X[33])

befuddlers 
 0
 2
 1
 1
 0
 1
 0
 0
 0
 0
 0
 1
 1
 0
 2
 0
 0
 0
 0
 0
 0
 0
 0
 1
 0
 0
 0
[torch.FloatTensor of size 27]



In [5]:
from torch.utils.data import TensorDataset, DataLoader
train_set = TensorDataset(X, Y)
train_loader = DataLoader(train_set, batch_size=4, shuffle=True)

In [6]:
class LinearModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Linear(len(vocab), 2)
    def forward(self, x):
        return self.l1(x)

model = LinearModel()
model

LinearModel(
  (l1): Linear(in_features=27, out_features=2)
)

In [7]:
def fit(model, epochs):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    for epoch in range(epochs):
        total_loss = 0
        num = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_scores = model(Variable(x))
            loss = criterion(y_scores, Variable(y))
            loss.backward()
            optimizer.step()
            total_loss += loss.data[0]
            num += len(y)
        print(epoch, total_loss / num)
fit(model, 10)

0 0.17422058992981912
1 0.17376696672141553
2 0.17374638694524766
3 0.17378649952113628
4 0.17372373614907266
5 0.17379510149657726
6 0.1738329892784357
7 0.17372430271208286
8 0.173788753926754
9 0.17376605225801467


In [8]:
def predict(word):
    word = '^%s$' % word
    x = torch.zeros(1, len(vocab))
    for feature in list(word):
        if feature in vocab:
            x[0, vocab[feature]] += 1
    y_scores = model(Variable(x))
    y_pred = torch.max(y_scores, 1)[1]
    return y_pred.data[0]

print(predict('coming'))
print(predict('venu'))

1
1
