Update
This post is obsolete; please head over to this post instead.
I’m tired of converting every file in my project from Shift-JIS to UTF-8 by hand these days, so I wanted to write a script that does this job for me automatically. After searching the Internet, I found that it’s an easy job with Python.
Python (version 2.6 or later) has a library called codecs, and all we have to do is use this library to read and write files in different encodings.
This code converts all files, including files in sub-folders, from Shift-JIS (or whatever encoding is detected) to UTF-8.
Install chardet first.
1
pip install chardet
Copy this script into the folder whose files you want to convert, and run it.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/python
import os
import re
import sys
import chardet
#Created by Leon on March, 5, 2011
#Translate all files in current folder to utf-8 encoding
file_pattern = r'^.*\.(h|m|mm|cpp|inl|def|txt|js|html?|c|py|css)$'
to_encoding = 'utf-8'
def transcode(file_name):
    """Re-encode ``file_name`` in place to the module-level ``to_encoding``.

    The source encoding is guessed with ``chardet``. A backup copy named
    ``file_name + '.bk'`` exists while the file is being rewritten and is
    removed afterwards, unless rewriting the file itself failed, in which
    case the backup is left on disk so the original content is not lost.

    Args:
        file_name: Path of the file to convert.

    Returns:
        True if the file is empty, is already in the target encoding, or
        was converted; False if the encoding could not be detected with
        enough confidence, the content could not be decoded, or the
        converted content could not be written.

    Raises:
        IOError/OSError: if the file cannot be read or the backup cannot
            be created (nothing has been modified at that point).
    """
    # Work on raw bytes: text mode would translate newlines (and stop at
    # Ctrl-Z on Windows), corrupting multi-byte content before detection.
    with open(file_name, 'rb') as src:
        raw = src.read()
    if not raw:
        # Nothing to detect or convert in an empty file.
        return True

    # Backup
    bk_file = file_name + '.bk'
    with open(bk_file, 'wb') as backup:
        backup.write(raw)

    # Trans
    succeed = True
    keep_backup = False
    try:
        guess = chardet.detect(raw)
        encoding = guess.get('encoding') if guess else None
        if not encoding or guess['confidence'] < 0.618:
            raise ValueError('encoding could not be detected')
        # Detector output and the configured target may differ in case.
        if encoding.lower() != to_encoding.lower():
            if encoding.upper() in ('GB2312', 'GBK'):
                # GB18030 is a superset of both, so it decodes more files.
                encoding = 'GB18030'
            print(file_name + ': ' + encoding + ' ==> ' + to_encoding)
            # Decode/encode fully before touching the original file.
            converted = raw.decode(encoding).encode(to_encoding)
            # From here on the original may be truncated; only drop the
            # backup once the new content has been written completely.
            keep_backup = True
            with open(file_name, 'wb') as dst:
                dst.write(converted)
            keep_backup = False
    except (ValueError, LookupError):
        # ValueError covers UnicodeDecodeError and the low-confidence case;
        # LookupError covers a codec name Python does not know.
        succeed = False
        print(file_name + '\'s encoding not known.')
    except EnvironmentError:
        succeed = False
        print(file_name + ' could not be written; original kept in ' + bk_file)
    finally:
        if not keep_backup:
            os.remove(bk_file)
    return succeed
# Driver: convert every matching file below the folder that contains this
# script, then report the files that could not be converted.
path = os.path.abspath(os.path.dirname(sys.argv[0]))
print("Current Path: " + path)

# Full, normalised path of this script, so it can be skipped reliably.
# Comparing a bare file name against __file__ fails whenever the script is
# started through a relative or absolute path.
this_script = os.path.normcase(os.path.abspath(sys.argv[0]))

errors = []
for dirpath, dirs, files in os.walk(path):
    for filename in files:
        if not re.search(file_pattern, filename):
            continue
        full_path = os.path.join(dirpath, filename)
        if os.path.normcase(os.path.abspath(full_path)) == this_script:
            continue
        print(filename + ' ... ')
        if not transcode(full_path):
            # Keep the full path: the same name may occur in several folders.
            errors.append(full_path)

if errors:
    print("--------------------------------------------------------")
    print("These files got error:")
    for err in errors:
        print(err)
    print("--------------------------------------------------------")
else:
    print("")
    print("All files have been translated successfully.")
print("")
print("Created for you by Leon on March, 5, 2011.")

# Keep the console window open until the user presses Enter.
try:
    pause = raw_input  # Python 2
except NameError:
    pause = input  # Python 3
try:
    pause()
except EOFError:
    # No interactive stdin (e.g. piped run); nothing to wait for.
    pass
Actually, this script detects the encoding of each file and converts every file that is not already UTF-8 (for example Shift-JIS, GBK, GB2312, CP936, or, trivially, ASCII) to UTF-8.