Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F85661490
encdet_utf8.py.wml
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Sep 30, 21:39
Size
8 KB
Mime Type
text/x-python
Expires
Wed, Oct 2, 21:39 (2 d)
Engine
blob
Format
Raw Data
Handle
21215436
Attached To
R3600 invenio-infoscience
encdet_utf8.py.wml
View Options
<protect>#!/usr/bin/env python
# -*- encoding: utf8 -*-
# Converted from the original japanese.ms932 encoding to utf8.
# encdet.py - An encoding detector
# by Yusuke Shinyama
# * public domain *
import sys, re
## EncodingRecognizer
## - a finite automaton which receives octets
##
class EncodingRecognizer:
    """Score a stream of characters for plausibility in one encoding.

    Subclasses implement ``feed(octet)`` as a small state machine that
    splits the input into characters, hands every complete character to
    ``accept()`` and reports invalid byte sequences through ``die()``.

    Attributes:
      encoding      -- codec name used to decode accepted characters.
      ch            -- bytes of the character currently being assembled.
      state         -- 0: given up; 1: expecting the first byte of a
                       character; >1: inside a multi-byte character.
      partial_score -- score accumulated by the current chunk.
      total_score   -- sum of squared chunk scores plus death penalties.
      chunk_type    -- CHARSET flags of the current chunk (0: none).
    """

    SCORE_DEFAULT = 0.5
    DEATH_PENALTY = -100
    GIVEUP_THRESHOLD = -1000

    # character sets: must be exclusive!
    # Each entry is (score per character, pattern, chunk flags).
    CHARSET = [
        # zenkaku-kana
        (1.5, re.compile(u"[ぁ-ん]"), 0x01),
        (1.5, re.compile(u"[ァ-ヴ]"), 0x02),
        (1.0, re.compile(u"[ーヽヾゝゞ]"), 0x03),
        # hankaku latin
        (1.2, re.compile(u"[a-zA-Z0-9]"), 0x04),
        (0.0, re.compile(u"[\u00c0-\u00ff]"), 0x04),
        # hankaku-kana
        (0.8, re.compile(u"[\uff66-\uff9d]"), 0x08),
        # zenkaku-alphanum: full-width A-Z, a-z and 0-9.  Written with
        # escapes so the class cannot silently collapse into the ASCII
        # class above (which would make this entry unreachable).
        (1.2, re.compile(u"[\uff21-\uff3a\uff41-\uff5a\uff10-\uff19]"), 0x10),
        # kanji
        (1.0, re.compile(u"[\u4e00-\u9fff]"), 0x20),
    ]

    def __init__(self, encoding):
        """Create a recognizer that decodes characters with *encoding*."""
        self.encoding = encoding
        self.ch = ""
        self.state = 1
        self.partial_score = 0.0
        self.total_score = 0.0
        self.chunk_type = 0

    def __repr__(self):
        # Scores are floats; %s keeps the fractional part visible.
        return "<EncodingRecognizer: %s, state=%d, chunk_type=%s, partial_score=%s, total_score=%s>" % \
            (self.encoding, self.state, self.chunk_type, self.partial_score, self.total_score)

    def die(self):
        """Penalise an invalid byte sequence and restart, or give up.

        Once the total score has dropped to GIVEUP_THRESHOLD the
        recognizer enters state 0 and ignores all further input.
        """
        self.total_score += self.DEATH_PENALTY
        if self.total_score <= self.GIVEUP_THRESHOLD:
            # game is over...
            self.state = 0
        else:
            # try again...
            self.state = 1
        # The unfinished chunk and character are discarded either way.
        self.partial_score = 0.0
        self.ch = ""

    def flush(self):
        """Close the current chunk and add its squared score to the total.

        Squaring makes one long chunk worth more than the same characters
        split over several chunks.
        """
        self.total_score += self.partial_score * self.partial_score
        self.partial_score = 0.0

    def accept(self, s):
        """Score one complete character given as the byte string *s*.

        The bytes are decoded with ``self.encoding`` and matched against
        CHARSET.  A character whose flags do not overlap the current
        chunk's flags starts a new chunk.  Characters that match no
        entry, and bytes that fail to decode, end the chunk and earn
        SCORE_DEFAULT.
        """
        try:
            # Same result as unicode(s, encoding) for a byte string.
            c = s.decode(self.encoding)
        except UnicodeError:
            c = u""
        for (score, pat, flags) in self.CHARSET:
            if pat.match(c):
                if self.chunk_type == 0 or not (self.chunk_type & flags):
                    self.flush()
                    self.chunk_type = flags
                self.partial_score += score
                break
        else:
            # No character set matched.
            self.flush()
            self.chunk_type = 0
            self.partial_score += self.SCORE_DEFAULT

    def finish(self):
        """Signal end of input: flush the last chunk, and count a
        character left unfinished (state > 1) as a death."""
        self.flush()
        if 1 < self.state:
            self.die()
## CHARACTER SETS
## ISO-8859-*
##
class ISO8859_Recognizer(EncodingRecognizer):
    """Single-byte recognizer that decodes with the "iso8859_1" codec."""

    def __init__(self):
        EncodingRecognizer.__init__(self, "iso8859_1")

    def feed(self, c):
        """Consume one octet *c* (an int).

        Octets below 0x7f and octets in 0xa0..0xff are accepted as a
        character; anything else counts as a death.
        """
        if self.state != 1:
            # Dead recognizers ignore input; no other state is ever set here.
            return
        in_range = c < 0x7f or 0xa0 <= c <= 0xff
        if not in_range:
            self.die()
            return
        self.state = 1
        self.accept(chr(c))
## EUC-JP
##
class EUCJP_Recognizer(EncodingRecognizer):
    """Two-state recognizer for EUC-JP (codec name "japanese.euc_jp").

    NOTE(review): the "japanese.*" codec names are presumably provided
    by an external Japanese codecs package -- confirm it is installed.
    """

    def __init__(self):
        # True while the pending character is a half-width kana.  Nothing
        # sets it to True at present: the half-width lead byte 0x8e is
        # deliberately not recognised ("no one is using" it), so 0x8e is
        # treated as an invalid first byte.
        self.hankaku = False
        EncodingRecognizer.__init__(self, "japanese.euc_jp")

    def feed(self, c):
        """Consume one octet *c* (an int)."""
        if self.state == 0:
            # already dead
            return

        if self.state == 1:
            # First byte of a character.
            if c < 0x7f:
                # ASCII: a complete character on its own.
                self.state = 1
                self.accept(chr(c))
                self.ch = ""
            elif 0xa1 <= c <= 0xfe:
                # Lead byte of a two-byte character; wait for the trail byte.
                self.state = 2
                self.ch = chr(c)
                self.hankaku = False
            else:
                self.die()
        elif self.state == 2:
            # Second byte: half-width kana stop at 0xdf, everything else
            # at 0xfe.
            upper = 0xdf if self.hankaku else 0xfe
            if 0xa1 <= c <= upper:
                self.ch += chr(c)
                self.accept(self.ch)
                self.state = 1
                self.ch = ""
            else:
                self.die()
## CP932
##
class CP932_Recognizer(EncodingRecognizer):
    """Two-state recognizer for CP932 (codec name "japanese.ms932").

    NOTE(review): the "japanese.*" codec names are presumably provided
    by an external Japanese codecs package -- confirm it is installed.
    """

    def __init__(self):
        EncodingRecognizer.__init__(self, "japanese.ms932")

    def feed(self, c):
        """Consume one octet *c* (an int)."""
        if self.state == 0:
            # already dead
            return

        if self.state == 1:
            # First byte of a character.
            single_byte = c < 0x7f or 0xa1 <= c <= 0xdf  # ascii / hankaku-kana
            lead_byte = (0x81 <= c <= 0x9f
                         or 0xe0 <= c <= 0xee
                         or 0xfa <= c <= 0xfc)           # kanji 1st byte
            if single_byte:
                self.state = 1
                self.accept(chr(c))
                self.ch = ""
            elif lead_byte:
                self.state = 2
                self.ch = chr(c)
            else:
                self.die()
        elif self.state == 2:
            # Second byte of a two-byte character: 0x40..0xfc except 0x7f.
            if c != 0x7f and 0x40 <= c <= 0xfc:
                self.accept(self.ch + chr(c))
                self.state = 1
                self.ch = ""
            else:
                self.die()
## UTF-8
##
class UTF8_Recognizer(EncodingRecognizer):
    """Recognizer for UTF-8 (codec name "utf8").

    ``state`` is 1 between characters and grows by one for every byte of
    a multi-byte character consumed so far; ``left`` counts the
    continuation bytes still expected.
    """

    # (mask, expected bits, number of continuation bytes) per lead-byte form.
    # NOTE(review): the 5-byte form (111110xx) is not part of current
    # UTF-8; such sequences pass this automaton and are then left to the
    # decoder in accept().
    _LEAD_BYTES = (
        (0xe0, 0xc0, 1),  # 110xxxxx: 2 bytes
        (0xf0, 0xe0, 2),  # 1110xxxx: 3 bytes
        (0xf8, 0xf0, 3),  # 11110xxx: 4 bytes
        (0xfc, 0xf8, 4),  # 111110xx: 5 bytes
    )

    def __init__(self):
        self.left = 0
        EncodingRecognizer.__init__(self, "utf8")

    def feed(self, c):
        """Consume one octet *c* (an int)."""
        if self.state == 0:
            # already dead
            return

        if self.state == 1:
            # First byte of a character.
            if c <= 0x7f:
                # Single-byte character.
                self.state = 1
                self.accept(chr(c))
                self.ch = ""
                return
            for mask, bits, trailing in self._LEAD_BYTES:
                if c & mask == bits:
                    self.state = 2
                    self.left = trailing
                    self.ch = chr(c)
                    return
            # Neither ASCII nor a known lead byte.
            self.die()
            return

        # n-th byte (2 <= n): must be a continuation byte, 10xxxxxx.
        if c & 0xc0 != 0x80:
            self.die()
            return
        self.state += 1
        self.left -= 1
        self.ch += chr(c)
        if self.left == 0:
            # Character complete.
            self.state = 1
            self.accept(self.ch)
            self.ch = ""
# guess
def guess(s):
    """Return the codec name of the recognizer that scores *s* highest.

    Every character of the byte string *s* is fed, as an octet, to one
    recognizer per candidate encoding.  On a tie the recognizer listed
    first wins.
    """
    candidates = [
        EUCJP_Recognizer(),
        CP932_Recognizer(),
        ISO8859_Recognizer(),
        UTF8_Recognizer(),
    ]
    for ch in s:
        octet = ord(ch)
        for rec in candidates:
            rec.feed(octet)
    for rec in candidates:
        rec.finish()
    # max() returns the first maximal element, which is the same winner a
    # stable descending sort would put at index 0.
    best = max(candidates, key=lambda rec: rec.total_score)
    return best.encoding
# test suite
def test(s0, test_encodings):
    """Encode *s0* in each of *test_encodings* and report what guess() says.

    For every encoding that can represent *s0*, prints the encoded bytes
    in hex, lists the other known encodings under which those bytes
    decode to pure-ASCII text, and prints whether guess() recovered the
    encoding that was actually used.
    """
    false_encodings = [ "japanese.euc_jp", "japanese.ms932", "utf8", "iso8859_1" ]
    for enc1 in test_encodings:
        try:
            s = s0.encode(enc1)
        except UnicodeError:
            # s0 is not representable in this encoding.
            continue
        hexdump = " ".join("%02x" % ord(ch) for ch in s)
        print("try '%s' in %s (%s)" % (s0.encode('utf8'), enc1.encode('utf8'), hexdump))
        for enc2 in false_encodings:
            if enc1 == enc2:
                continue
            try:
                # str() fails for non-ASCII text, so only plain-ASCII
                # alternative readings are reported.
                x = str(unicode(s, enc2))
            except UnicodeError:
                continue
            print(" (could be: '%s' in %s)" % (x, enc2))
        genc = guess(s)
        verdict = " CORRECT:" if genc == enc1 else " ! INCORRECT:"
        print("%s %s" % (verdict, genc))
    print("")
def test_suite():
    """Run test() on a few sample strings, each with the encodings that
    can represent it."""
    # kana only
    test(u"こんにちは", ["japanese.euc_jp", "japanese.ms932", "utf8"])
    # kana + alphanum
    test(u"AはBとCである", ["japanese.euc_jp", "japanese.ms932", "utf8"])
    # kana + kanji
    test(u"毎朝新聞ニュース", ["japanese.euc_jp", "japanese.ms932", "utf8"])
    # kanji + hankakukana: the katakana are spelled with escapes for the
    # half-width forms (U+FF61..U+FF9F) so that this case really
    # exercises half-width kana; EUC-JP is left out because its
    # recognizer ignores half-width kana.
    test(u"無題\uff84\uff9e\uff77\uff6d\uff92\uff9d\uff84", ["japanese.ms932", "utf8"])
    # iso8859-1
    test(u"Enzyklop\u00e4die", ["utf8", "iso8859_1"])
# main
test_suite(); sys.exit(0)
if __name__ == "__main__":
import fileinput
for s in fileinput.input():
print guess(s)</protect>
Event Timeline
Log In to Comment