KMNIST

Similar to @earth's penguin EMNIST project, I am developing a CNN (Convolutional Neural Network) in Python to parse the KMNIST dataset. A quick summary of what this dataset consists of: 60,000 training and 10,000 testing examples of handwritten Kuzushiji Hiragana characters (cursive Japanese characters). Do I know Japanese? No. Is it a cool AI project? Yes.
4 Replies
assange
assange4w ago
This script pretty much grabs a random image (with its label) from the dataset, parses it, and sends it back to you with an ASCII version of it as well as a matplotlib version of it.
def parse():
    """Pick a random KMNIST training sample, save it to disk and display it.

    The dataset is read through two routes: ``mndata`` supplies the raw
    images/labels used for the ASCII rendering and for the returned sample,
    while ``idx2numpy`` supplies arrays that are divided by 255.0 and used for
    the ``KMNIST-*.json`` dumps.

    Side effects:
        Writes ``KMNIST-Train.json``, ``KMNIST-Test.json``, ``image.json`` and
        ``label.json`` in the current directory, prints an ASCII rendering of
        the chosen image and opens a matplotlib window.

    Returns:
        tuple: ``(image, label)`` as numpy arrays; ``image`` holds the chosen
        training image divided by 255.0 and ``label`` holds its label.
    """
    train_file = 'samples/train-images-idx3-ubyte'
    test_file = 'samples/t10k-images-idx3-ubyte'
    # Only the labels of the test split are needed below.
    _, test_labels = mndata.load_testing()
    train_images, train_labels = mndata.load_training()

    # One row of 28 * 28 values per image.  reshape() on its own is enough;
    # a preceding flatten() would only copy the whole array first.
    train_arr = (idx2numpy.convert_from_file(train_file) / 255.0).reshape(-1, 28 * 28)
    test_arr = (idx2numpy.convert_from_file(test_file) / 255.0).reshape(-1, 28 * 28)

    def randoImage(arr, labels, filename):
        """Write one randomly chosen row of *arr* and its label to *filename* as JSON."""
        index = random.randrange(len(arr))
        data = {
            'image': arr[index].tolist(),
            'label': labels[index],
        }
        with open(filename, 'w') as f:
            json.dump(data, f, indent=4)

    # Save one random sample from each split.
    randoImage(train_arr, train_labels, 'KMNIST-Train.json')
    randoImage(test_arr, test_labels, 'KMNIST-Test.json')

    # Display a random image from the training dataset.  This index is drawn
    # independently of the samples saved above.
    index = random.randrange(len(train_arr))
    print(mndata.display(train_images[index]))

    image = np.array(train_images[index]) / 255.0
    label = np.array(train_labels[index])

    with open('image.json', 'w') as f:
        json.dump(image.tolist(), f, indent=4)
    with open('label.json', 'w') as f:
        json.dump(label.tolist(), f, indent=4)

    # matplotlib view of the same sample.
    plt.imshow(image.reshape(28, 28), cmap='gray')
    plt.title(f'Label: {label}')
    plt.show()

    return image, label


parse()
def parse():
    """Pick a random KMNIST training sample, save it to disk and display it.

    The dataset is read through two routes: ``mndata`` supplies the raw
    images/labels used for the ASCII rendering and for the returned sample,
    while ``idx2numpy`` supplies arrays that are divided by 255.0 and used for
    the ``KMNIST-*.json`` dumps.

    Side effects:
        Writes ``KMNIST-Train.json``, ``KMNIST-Test.json``, ``image.json`` and
        ``label.json`` in the current directory, prints an ASCII rendering of
        the chosen image and opens a matplotlib window.

    Returns:
        tuple: ``(image, label)`` as numpy arrays; ``image`` holds the chosen
        training image divided by 255.0 and ``label`` holds its label.
    """
    train_file = 'samples/train-images-idx3-ubyte'
    test_file = 'samples/t10k-images-idx3-ubyte'
    # Only the labels of the test split are needed below.
    _, test_labels = mndata.load_testing()
    train_images, train_labels = mndata.load_training()

    # One row of 28 * 28 values per image.  reshape() on its own is enough;
    # a preceding flatten() would only copy the whole array first.
    train_arr = (idx2numpy.convert_from_file(train_file) / 255.0).reshape(-1, 28 * 28)
    test_arr = (idx2numpy.convert_from_file(test_file) / 255.0).reshape(-1, 28 * 28)

    def randoImage(arr, labels, filename):
        """Write one randomly chosen row of *arr* and its label to *filename* as JSON."""
        index = random.randrange(len(arr))
        data = {
            'image': arr[index].tolist(),
            'label': labels[index],
        }
        with open(filename, 'w') as f:
            json.dump(data, f, indent=4)

    # Save one random sample from each split.
    randoImage(train_arr, train_labels, 'KMNIST-Train.json')
    randoImage(test_arr, test_labels, 'KMNIST-Test.json')

    # Display a random image from the training dataset.  This index is drawn
    # independently of the samples saved above.
    index = random.randrange(len(train_arr))
    print(mndata.display(train_images[index]))

    image = np.array(train_images[index]) / 255.0
    label = np.array(train_labels[index])

    with open('image.json', 'w') as f:
        json.dump(image.tolist(), f, indent=4)
    with open('label.json', 'w') as f:
        json.dump(label.tolist(), f, indent=4)

    # matplotlib view of the same sample.
    plt.imshow(image.reshape(28, 28), cmap='gray')
    plt.title(f'Label: {label}')
    plt.show()

    return image, label


parse()
'''
This piece of the code is what really flattens the dataset


train_file = 'samples/train-images-idx3-ubyte'
test_file = 'samples/t10k-images-idx3-ubyte'
test_images, test_labels = mndata.load_testing()
train_images, train_labels = mndata.load_training()

train_arr = idx2numpy.convert_from_file(train_file) / 255.0
test_arr = idx2numpy.convert_from_file(test_file) / 255.0

train_arr = train_arr.flatten().reshape(-1, 28 * 28)
test_arr = test_arr.flatten().reshape(-1, 28 * 28)

'''
'''
This piece of the code is what really flattens the dataset


train_file = 'samples/train-images-idx3-ubyte'
test_file = 'samples/t10k-images-idx3-ubyte'
test_images, test_labels = mndata.load_testing()
train_images, train_labels = mndata.load_training()

train_arr = idx2numpy.convert_from_file(train_file) / 255.0
test_arr = idx2numpy.convert_from_file(test_file) / 255.0

train_arr = train_arr.flatten().reshape(-1, 28 * 28)
test_arr = test_arr.flatten().reshape(-1, 28 * 28)

'''
I made a testing script to put the parsed and flattened arrays into a json file, and it worked!
import numpy as np
import json
import random
from mnist import MNIST
import idx2numpy
import matplotlib.pyplot as plt
mndata = MNIST('samples')

def _dump_rows(arr, filename):
    """Write the 2-D array *arr* to *filename* as a JSON array of rows.

    Rows are serialized one at a time, one per line and without indentation,
    so the full dataset is never converted to a single nested Python list and
    the file does not carry one indented line per pixel value.
    """
    with open(filename, 'w') as f:
        f.write('[\n')
        for i, row in enumerate(arr):
            if i:
                f.write(',\n')
            f.write(json.dumps(row.tolist()))
        f.write('\n]\n')


def test_parse():
    """Flatten both KMNIST image files and dump them to JSON.

    Each IDX image file is read with ``idx2numpy``, divided by 255.0 and
    reshaped to one row of 28 * 28 values per image.  The results are written
    to ``KMNIST-FILETEST-Train.json`` and ``KMNIST-FILETEST-Test.json`` as a
    JSON array of rows, after which "Parsing Complete" is printed.

    The parsed JSON value is the same list of lists as before; only the
    whitespace layout of the file is more compact.
    """
    train_file = 'samples/train-images-idx3-ubyte'
    test_file = 'samples/t10k-images-idx3-ubyte'

    # reshape() alone yields the (n_images, 784) layout; a preceding
    # flatten() would only add a full copy of the array.
    train_arr = (idx2numpy.convert_from_file(train_file) / 255.0).reshape(-1, 28 * 28)
    test_arr = (idx2numpy.convert_from_file(test_file) / 255.0).reshape(-1, 28 * 28)

    _dump_rows(train_arr, 'KMNIST-FILETEST-Train.json')
    _dump_rows(test_arr, 'KMNIST-FILETEST-Test.json')

    print("Parsing Complete")


test_parse()
import numpy as np
import json
import random
from mnist import MNIST
import idx2numpy
import matplotlib.pyplot as plt
mndata = MNIST('samples')

def _dump_rows(arr, filename):
    """Write the 2-D array *arr* to *filename* as a JSON array of rows.

    Rows are serialized one at a time, one per line and without indentation,
    so the full dataset is never converted to a single nested Python list and
    the file does not carry one indented line per pixel value.
    """
    with open(filename, 'w') as f:
        f.write('[\n')
        for i, row in enumerate(arr):
            if i:
                f.write(',\n')
            f.write(json.dumps(row.tolist()))
        f.write('\n]\n')


def test_parse():
    """Flatten both KMNIST image files and dump them to JSON.

    Each IDX image file is read with ``idx2numpy``, divided by 255.0 and
    reshaped to one row of 28 * 28 values per image.  The results are written
    to ``KMNIST-FILETEST-Train.json`` and ``KMNIST-FILETEST-Test.json`` as a
    JSON array of rows, after which "Parsing Complete" is printed.

    The parsed JSON value is the same list of lists as before; only the
    whitespace layout of the file is more compact.
    """
    train_file = 'samples/train-images-idx3-ubyte'
    test_file = 'samples/t10k-images-idx3-ubyte'

    # reshape() alone yields the (n_images, 784) layout; a preceding
    # flatten() would only add a full copy of the array.
    train_arr = (idx2numpy.convert_from_file(train_file) / 255.0).reshape(-1, 28 * 28)
    test_arr = (idx2numpy.convert_from_file(test_file) / 255.0).reshape(-1, 28 * 28)

    _dump_rows(train_arr, 'KMNIST-FILETEST-Train.json')
    _dump_rows(test_arr, 'KMNIST-FILETEST-Test.json')

    print("Parsing Complete")


test_parse()
...i think
assange
assange4w ago
So it ended up creating the files, and for a split second I was able to see floats :D but because of how much data there was per file, it crashed :(
No description
assange
assange4w ago
output:
No description
Dumb Bird
Dumb Bird3w ago
If you don't know Japanese, how do you verify if it's working correctly?