hanzi-writer-data/stroke_data_parser.py
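"""Generate per-character stroke data for Hanzi Writer.

Reads makemeahanzi's dictionary.txt (decomposition/radical metadata) and
graphics.txt (stroke data), works out which strokes belong to each
character's radical, and writes one JSON file per character plus a
combined all.json into the data/ directory.
"""
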
import json
import os
from copy import copy
root = os.path.dirname(__file__)
dictionary_file = os.path.join(root, 'vendor/makemeahanzi/dictionary.txt')
graphics_file = os.path.join(root, 'vendor/makemeahanzi/graphics.txt')
output_dir = os.path.join(root, 'data')
# Ideographic Description Characters used in makemeahanzi decomposition strings,
# mapped to the number of component slots each one takes.
positioners = {
    '⿰': 2,
    '⿱': 2,
    '⿲': 3,
    '⿳': 3,
    '⿴': 2,
    '⿵': 2,
    '⿶': 2,
    '⿷': 2,
    '⿸': 2,
    '⿹': 2,
    '⿺': 2,
    '⿻': 2,
}
missing_marker = '？'  # placeholder makemeahanzi uses for unknown decomposition components (not referenced below)
graphics_data = {}
dict_data = {}
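# Each line of dictionary.txt and graphics.txt is a standalone JSON object.
# Shapes assumed from the fields this script reads, roughly:
#   dictionary.txt: {"character": "好", "decomposition": "⿰女子", "radical": "女",
#                    "matches": [[0], [0], [0], [1], [1], [1]], ...}  # one decomposition path per stroke
#   graphics.txt:   {"character": "好", ...stroke outline and median point data...}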
with open(dictionary_file) as f:
    lines = f.readlines()
    for line in lines:
        decoded_line = json.loads(line)
        dict_data[decoded_line['character']] = decoded_line

with open(graphics_file) as f:
    lines = f.readlines()
    for line in lines:
        decoded_line = json.loads(line)
        char = decoded_line.pop('character')
        graphics_data[char] = decoded_line
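
# Example of the path format computed below (assuming makemeahanzi decomposes
# 好 as ⿰女子 with radical 女): get_decomp_index('好', '女') -> [0] (first slot
# of the ⿰ node), get_decomp_index('好', '子') -> [1].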
def get_decomp_index(char, subchar):
    """Walk the decomposition tree of char and return the path (index list) of subchar within it."""
    stack = []
    for piece in dict_data[char]['decomposition']:
        last_node = None
        path = []
        if len(stack) > 0:
            # this piece fills the next open slot of the most recent positioner
            last_node = stack.pop()
            path = copy(last_node['path'])
            path.append(last_node['children'])
            last_node['children'] += 1
            if last_node['children'] < last_node['size']:
                stack.append(last_node)
        if piece in positioners:
            # a positioner opens a new node with `size` component slots
            node = {
                'size': positioners[piece],
                'children': 0,
                'path': path,
            }
            stack.append(node)
        elif piece == subchar:
            return path
    return None

def get_radical_strokes(char):
    """Return the stroke numbers of char that belong to its radical, or None if they can't be determined."""
    radical = dict_data[char]['radical']
    if char == radical:
        return None
    decomp_index = get_decomp_index(char, radical)
    if not decomp_index:
        return None
    rad_strokes = []
    for stroke_num, match_index in enumerate(dict_data[char]['matches']):
        if match_index == decomp_index:
            rad_strokes.append(stroke_num)
    return rad_strokes
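
# Each per-character JSON keeps whatever graphics.txt provided for that character,
# with a 'radStrokes' list of radical stroke indices added when it could be computed;
# all.json bundles the same data for every character into a single object keyed by character.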
# write out data
for char in graphics_data:
    rad_strokes = get_radical_strokes(char)
    if rad_strokes:
        graphics_data[char]['radStrokes'] = rad_strokes

for char, data in graphics_data.items():
    out_file = os.path.join(output_dir, f'{char}.json')
    with open(out_file, 'w') as f:
        f.write(json.dumps(data, ensure_ascii=False))

with open(os.path.join(output_dir, 'all.json'), 'w') as f:
    f.write(json.dumps(graphics_data, ensure_ascii=False))