Last year I re-posted some obscure Kana & Romaji converter code for Ruby. Although I had fixed it to work with modern versions of Ruby, the romanization converter just didn’t want to work! Finally, after dedicating an afternoon to it, I found that the regex used to split the source string was returning nil. At some point in the distant past it worked, and then broke irreversibly.
In posting this update, I have also added support for “tsu” being transliterated to “つ” which the original author (being Japanese) did not include. In working this out, I discovered a bit about the internal design. As such I have added some helpful comments (helpful if they’d been there originally!)
Full credit goes to the mysterious K.Kodama who wrote this in 2002. Some 8 years later, now that it works better, this is the best kana/romaji converter available in Ruby!
Full code is included after the break!
# kana2rom.rb
#
# This script is distributed freely in the sense of GNU General Public License.
# K.Kodama 2002.06
#
# Cleaned up and repaired to work with modern Ruby versions, added more comments
# crunchyt a~/t crunchytoast.com 2010.04
#
# Kana2rom::kana2rom(str) かな-->ロ-マ字 変換 ### Romaji conversion functions to not work!!!
# Kana2rom::rom2kata(str) ロ-マ字-->片仮名 変換
# Kana2rom::rom2hira(str) ロ-マ字-->平仮名 変換
# Kana2rom::hira2kata(str) 平仮名-->片仮名 変換
# Kana2rom::kata2hira(str) 片仮名-->平仮名 変換
# Kana2rom::kana2kana(str) converts in both directions (平仮名<-->片仮名); returns the input plus any conversion that differs from it
#
# Conversion between Japanese kana (hiragana / katakana) and romaji.
#
# All methods are exported with module_function, so they can be called as
# Kana2rom.kana2rom(str), Kana2rom::rom2kata(str), and so on.
module Kana2rom
  # Kana --> romaji, one kana character at a time.
  # Small kana use an "x" prefix ("xtu", "xya", ...) and the standalone
  # voicing marks map to '"' and "'"; kana2rom resolves both afterwards.
  Kana2romH = {
    "ア"=>"a", "イ"=>"i", "ウ"=>"u", "エ"=>"e","オ"=>"o",
    "あ"=>"a", "い"=>"i", "う"=>"u", "え"=>"e","お"=>"o",
    "カ"=>"ka", "キ"=>"ki", "ク"=>"ku", "ケ"=>"ke", "コ"=>"ko",
    "か"=>"ka", "き"=>"ki", "く"=>"ku", "け"=>"ke", "こ"=>"ko",
    "ガ"=>"ga", "ギ"=>"gi", "グ"=>"gu", "ゲ"=>"ge", "ゴ"=>"go",
    "が"=>"ga", "ぎ"=>"gi", "ぐ"=>"gu", "げ"=>"ge", "ご"=>"go",
    "サ"=>"sa", "シ"=>"si", "ス"=>"su", "セ"=>"se", "ソ"=>"so",
    "さ"=>"sa", "し"=>"si", "す"=>"su", "せ"=>"se", "そ"=>"so",
    "ザ"=>"za", "ジ"=>"ji", "ズ"=>"zu", "ゼ"=>"ze", "ゾ"=>"zo",
    "ざ"=>"za", "じ"=>"ji", "ず"=>"zu", "ぜ"=>"ze", "ぞ"=>"zo",
    "タ"=>"ta", "チ"=>"chi", "ツ"=>"tsu", "テ"=>"te", "ト"=>"to",
    "た"=>"ta", "ち"=>"chi", "つ"=>"tsu", "て"=>"te", "と"=>"to",
    "ダ"=>"da", "ヂ"=>"di", "ヅ"=>"du", "デ"=>"de", "ド"=>"do",
    "だ"=>"da", "ぢ"=>"di", "づ"=>"du", "で"=>"de", "ど"=>"do",
    "ナ"=>"na", "ニ"=>"ni", "ヌ"=>"nu", "ネ"=>"ne", "ノ"=>"no",
    "な"=>"na", "に"=>"ni", "ぬ"=>"nu", "ね"=>"ne", "の"=>"no",
    "ハ"=>"ha", "ヒ"=>"hi", "フ"=>"hu", "ヘ"=>"he", "ホ"=>"ho",
    "は"=>"ha", "ひ"=>"hi", "ふ"=>"hu", "へ"=>"he", "ほ"=>"ho",
    "バ"=>"ba", "ビ"=>"bi", "ブ"=>"bu", "ベ"=>"be", "ボ"=>"bo",
    "ば"=>"ba", "び"=>"bi", "ぶ"=>"bu", "べ"=>"be", "ぼ"=>"bo",
    "パ"=>"pa", "ピ"=>"pi", "プ"=>"pu", "ペ"=>"pe", "ポ"=>"po",
    "ぱ"=>"pa", "ぴ"=>"pi", "ぷ"=>"pu", "ぺ"=>"pe", "ぽ"=>"po",
    "マ"=>"ma", "ミ"=>"mi", "ム"=>"mu", "メ"=>"me", "モ"=>"mo",
    "ま"=>"ma", "み"=>"mi", "む"=>"mu", "め"=>"me", "も"=>"mo",
    "ヤ"=>"ya", "ユ"=>"yu", "ヨ"=>"yo",
    "や"=>"ya", "ゆ"=>"yu", "よ"=>"yo",
    "ラ"=>"ra","リ"=>"ri","ル"=>"ru","レ"=>"re","ロ"=>"ro",
    "ら"=>"ra","り"=>"ri","る"=>"ru","れ"=>"re","ろ"=>"ro",
    "ワ"=>"wa","ヰ"=>"wi", "ヱ"=>"we", "ヲ"=>"wo", "ン"=>"nn",
    "わ"=>"wa","ゐ"=>"wi", "ゑ"=>"we", "を"=>"wo", "ん"=>"nn",
    "ァ"=>"xa", "ィ"=>"xi", "ゥ"=>"xu", "ェ"=>"xe", "ォ"=>"xo",
    "ぁ"=>"xa", "ぃ"=>"xi", "ぅ"=>"xu", "ぇ"=>"xe", "ぉ"=>"xo",
    "ッ"=>"xtu", "ャ"=>"xya", "ュ"=>"xyu", "ョ"=>"xyo",
    "っ"=>"xtu", "ゃ"=>"xya", "ゅ"=>"xyu", "ょ"=>"xyo",
    "ヴ"=>"vu","ヵ"=>"xka","ヶ"=>"ga","ヮ"=>"xwa",
    "ゎ"=>"xwa",
    "ー"=>"-", "−"=>"-", "゛"=>'"', "゜"=>"'"
  }

  # 1 character romaji patterns
  Rom2KataH1 = {
    "a"=>"ア", "i"=>"イ", "u"=>"ウ", "e"=>"エ", "o"=>"オ", "-"=>"ー"
  }

  # 2 character romaji patterns
  Rom2KataH2 = {
    "xa"=>"ァ", "xi"=>"ィ", "xu"=>"ゥ", "xe"=>"ェ", "xo"=>"ォ",
    "ka"=>"カ", "ki"=>"キ", "ku"=>"ク", "ke"=>"ケ", "ko"=>"コ",
    "ca"=>"カ", "cu"=>"ク", "co"=>"コ",
    "ga"=>"ガ", "gi"=>"ギ", "gu"=>"グ", "ge"=>"ゲ", "go"=>"ゴ",
    "sa"=>"サ", "si"=>"シ", "su"=>"ス", "se"=>"セ", "so"=>"ソ",
    "za"=>"ザ", "zi"=>"ジ", "zu"=>"ズ", "ze"=>"ゼ", "zo"=>"ゾ",
    "ja"=>"ジャ","ji"=>"ジ", "ju"=>"ジュ","je"=>"ジェ","jo"=>"ジョ",
    "ta"=>"タ", "ti"=>"チ", "tu"=>"ツ", "te"=>"テ", "to"=>"ト",
    "da"=>"ダ", "di"=>"ヂ", "du"=>"ヅ", "de"=>"デ", "do"=>"ド",
    "na"=>"ナ", "ni"=>"ニ", "nu"=>"ヌ", "ne"=>"ネ", "no"=>"ノ",
    "ha"=>"ハ", "hi"=>"ヒ", "hu"=>"フ", "he"=>"ヘ", "ho"=>"ホ",
    "ba"=>"バ", "bi"=>"ビ", "bu"=>"ブ", "be"=>"ベ", "bo"=>"ボ",
    "pa"=>"パ", "pi"=>"ピ", "pu"=>"プ", "pe"=>"ペ", "po"=>"ポ",
    "va"=>"ヴァ","vi"=>"ヴィ","vu"=>"ヴ", "ve"=>"ヴェ","vo"=>"ヴォ",
    "fa"=>"ファ","fi"=>"フィ","fu"=>"フ", "fe"=>"フェ","fo"=>"フォ",
    "ma"=>"マ", "mi"=>"ミ", "mu"=>"ム", "me"=>"メ", "mo"=>"モ",
    "ya"=>"ヤ", "yi"=>"イ", "yu"=>"ユ", "ye"=>"イェ", "yo"=>"ヨ",
    "ra"=>"ラ", "ri"=>"リ", "ru"=>"ル", "re"=>"レ", "ro"=>"ロ",
    "la"=>"ラ", "li"=>"リ", "lu"=>"ル", "le"=>"レ", "lo"=>"ロ",
    "wa"=>"ワ", "wi"=>"ヰ", "wu"=>"ウ", "we"=>"ヱ", "wo"=>"ヲ",
    "nn"=>"ン"
  }

  # 3 character romaji patterns
  Rom2KataH3 = {
    "tsu"=>"ツ",
    "xka"=>"ヵ", "xke"=>"ヶ",
    "xwa"=>"ヮ", "xtu"=>"ッ", "xya"=>"ャ", "xyu"=>"ュ", "xyo"=>"ョ",
    "kya"=>"キャ", "kyi"=>"キィ", "kyu"=>"キュ", "kye"=>"キェ", "kyo"=>"キョ",
    "gya"=>"ギャ", "gyi"=>"ギィ", "gyu"=>"ギュ", "gye"=>"ギェ", "gyo"=>"ギョ",
    "sya"=>"シャ", "syi"=>"シィ", "syu"=>"シュ", "sye"=>"シェ", "syo"=>"ショ",
    "sha"=>"シャ", "shi"=>"シ", "shu"=>"シュ", "she"=>"シェ", "sho"=>"ショ",
    "zya"=>"ジャ", "zyi"=>"ジィ", "zyu"=>"ジュ", "zye"=>"ジェ", "zyo"=>"ジョ",
    "jya"=>"ジャ", "jyi"=>"ジィ", "jyu"=>"ジュ", "jye"=>"ジェ", "jyo"=>"ジョ",
    "tya"=>"チャ", "tyi"=>"チィ", "tyu"=>"チュ", "tye"=>"チェ", "tyo"=>"チョ",
    "cya"=>"チャ", "cyi"=>"チィ", "cyu"=>"チュ", "cye"=>"チェ", "cyo"=>"チョ",
    "cha"=>"チャ", "chi"=>"チ", "chu"=>"チュ", "che"=>"チェ", "cho"=>"チョ",
    "tha"=>"テャ", "thi"=>"ティ", "thu"=>"テュ", "the"=>"テェ", "tho"=>"テョ",
    "dya"=>"ヂャ", "dyi"=>"ヂィ", "dyu"=>"ヂュ", "dye"=>"ヂェ", "dyo"=>"ヂョ",
    "dha"=>"デャ", "dhi"=>"ディ", "dhu"=>"デュ", "dhe"=>"デェ", "dho"=>"デョ",
    "nya"=>"ニャ", "nyi"=>"ニィ", "nyu"=>"ニュ", "nye"=>"ニェ", "nyo"=>"ニョ",
    "hya"=>"ヒャ", "hyi"=>"ヒィ", "hyu"=>"ヒュ", "hye"=>"ヒェ", "hyo"=>"ヒョ",
    "bya"=>"ビャ", "byi"=>"ビィ", "byu"=>"ビュ", "bye"=>"ビェ", "byo"=>"ビョ",
    "pya"=>"ピャ", "pyi"=>"ピィ", "pyu"=>"ピュ", "pye"=>"ピェ", "pyo"=>"ピョ",
    "mya"=>"ミャ", "myi"=>"ミィ", "myu"=>"ミュ", "mye"=>"ミェ", "myo"=>"ミョ",
    "rya"=>"リャ", "ryi"=>"リィ", "ryu"=>"リュ", "rye"=>"リェ", "ryo"=>"リョ",
    "lya"=>"リャ", "lyi"=>"リィ", "lyu"=>"リュ", "lye"=>"リェ", "lyo"=>"リョ"
  }

  # Katakana --> hiragana, one character at a time.
  Kata2hiraH = {
    "ア"=>"あ", "イ"=>"い", "ウ"=>"う", "エ"=>"え", "オ"=>"お",
    "カ"=>"か", "キ"=>"き", "ク"=>"く", "ケ"=>"け", "コ"=>"こ",
    "ガ"=>"が", "ギ"=>"ぎ", "グ"=>"ぐ", "ゲ"=>"げ", "ゴ"=>"ご",
    "サ"=>"さ", "シ"=>"し", "ス"=>"す", "セ"=>"せ", "ソ"=>"そ",
    "ザ"=>"ざ", "ジ"=>"じ", "ズ"=>"ず", "ゼ"=>"ぜ", "ゾ"=>"ぞ",
    "タ"=>"た", "チ"=>"ち", "ツ"=>"つ", "テ"=>"て", "ト"=>"と",
    "ダ"=>"だ", "ヂ"=>"ぢ", "ヅ"=>"づ", "デ"=>"で", "ド"=>"ど",
    "ナ"=>"な", "ニ"=>"に", "ヌ"=>"ぬ", "ネ"=>"ね", "ノ"=>"の",
    "ハ"=>"は", "ヒ"=>"ひ", "フ"=>"ふ", "ヘ"=>"へ", "ホ"=>"ほ",
    "バ"=>"ば", "ビ"=>"び", "ブ"=>"ぶ", "ベ"=>"べ", "ボ"=>"ぼ",
    "パ"=>"ぱ", "ピ"=>"ぴ", "プ"=>"ぷ", "ペ"=>"ぺ", "ポ"=>"ぽ",
    "マ"=>"ま", "ミ"=>"み", "ム"=>"む", "メ"=>"め", "モ"=>"も",
    "ヤ"=>"や", "ユ"=>"ゆ", "ヨ"=>"よ",
    "ラ"=>"ら", "リ"=>"り", "ル"=>"る", "レ"=>"れ", "ロ"=>"ろ",
    "ワ"=>"わ", "ヰ"=>"ゐ", "ヱ"=>"ゑ", "ヲ"=>"を", "ン"=>"ん",
    "ァ"=>"ぁ", "ィ"=>"ぃ", "ゥ"=>"ぅ", "ェ"=>"ぇ", "ォ"=>"ぉ",
    "ッ"=>"っ", "ャ"=>"ゃ", "ュ"=>"ゅ", "ョ"=>"ょ",
    "ヴ"=>"う゛", "ヵ"=>"か", "ヶ"=>"が", "ヮ"=>"ゎ"
  }

  # Hiragana --> katakana: the inverse of Kata2hiraH. Because ヵ/ヶ also map
  # to か/が, the plain inverse would send か/が to the small forms; pin them
  # back to the full-size katakana.
  Hira2kataH = Kata2hiraH.invert.merge("か"=>"カ", "が"=>"ガ")

  # Romanizes every kana character in +str+; all other characters are passed
  # through unchanged. +str+ itself is not modified.
  #
  #   kana2rom("がっこう")  #=> "gakkou"
  #   kana2rom("マッチ")    #=> "macchi"
  def kana2rom(str)
    # Step 1: character-by-character table lookup.
    s = str.each_char.map { |c| Kana2romH.fetch(c, c) }.join

    # Step 2: standalone (han)dakuten marks ゛゜ that follow a syllable.
    s = s.gsub(/(k)([aiueo])(")/, 'g\2').gsub(/(s)([aiueo])(")/, 'z\2').gsub(/(t)([aiueo])(")/, 'd\2')
    s = s.gsub(/(h)([aiueo])(")/, 'b\2').gsub(/(h)([aiueo])(')/, 'p\2').gsub(/u"/, 'vu')

    # Step 3: sokuon doubles the following consonant: ッカ-->xtuka-->kka.
    # "c" is in the class because チ is romanized as "chi" (ッチ-->cchi).
    # Repeated until stable so that consecutive ッッ are all consumed.
    loop do
      break unless s.gsub!(/(xtu)([kgszjtcdhbpmyrwv])/, '\2\2')
    end

    # Step 4: contracted sounds written with small kana.
    # "ch" is already a digraph, so drop the glide instead of producing "chy":
    s = s.gsub(/ch(ixy)([auo])/, 'ch\2')                    # チャ-->chixya-->cha
    s = s.gsub(/chixe/, 'che')                              # チェ-->chixe-->che
    s = s.gsub(/([kgszjtdnhbpmr])(ixy)([auo])/, '\1y\3')    # キャ-->kixya-->kya
    s = s.gsub(/([kgszjtdnhbpmr])(ix)([ie])/, '\1y\3')      # キィ-->kixi-->kyi
    s = s.gsub(/([td])(exy)([auo])/, '\1h\3')               # テャ-->texya-->tha
    s = s.gsub(/([td])(ex)([ie])/, '\1h\3')                 # ティ-->texi-->thi
    s = s.gsub(/(vux)([aieo])/, 'v\2')                      # ヴァ-->vuxa-->va
    s = s.gsub(/(hux)([aieo])/, 'f\2')                      # ファ-->huxa-->fa

    # Step 5: ン is "nn" in the table; shorten it to "n" before a consonant
    # (including the "c" of "chi") and at the end of a line.
    s.gsub(/(nn)([kgszjtcdhfbpmrwv])/, 'n\2').sub(/nn$/, 'n')
  end

  # Converts romaji in +str+ to katakana.
  #
  # The input is consumed one character at a time into a small buffer that is
  # matched against the 1-, 2- and 3-character tables. Characters that cannot
  # start or continue a romaji syllable are copied to the output unchanged.
  #
  #   rom2kata("ro-maji")  #=> "ローマジ"
  #   rom2kata("kitte")    #=> "キッテ"
  def rom2kata(str)
    result = "".dup
    buffer = []
    chars = str.each_char.to_a

    loop do
      case buffer.size
      when 0
        # Nothing pending: finished, or start a new syllable.
        return result if chars.empty?
        buffer.push(chars.shift)
      when 1
        head = buffer[0]
        if Rom2KataH1.key?(head)                       # a-->ア
          result << Rom2KataH1[head]
          buffer.clear
        elsif head =~ /[xkcgszjtdnhbpvfmyrlw]/         # consonant: needs more input
          # A lone trailing "n" is ン; any other trailing consonant is kept.
          return result + head.gsub(/n/, "ン") if chars.empty?
          buffer.push(chars.shift)
        else                                           # not romaji: pass through
          result << buffer.shift
        end
      when 2
        pair = buffer.join
        if Rom2KataH2.key?(pair)
          result << Rom2KataH2[pair]
          buffer.clear
        elsif pair =~ /([kgszjtcdnhbpmrl]y)|([stcd]h)|ts|(x[wytk])/
          # Possible prefix of a 3-character pattern: consume one more letter.
          return result + pair.gsub(/n/, "ン") if chars.empty?
          buffer.push(chars.shift)
        elsif buffer[0] == "n"                         # nk-->ンk
          result << "ン"
          buffer.shift
        elsif buffer[0] == buffer[1]                   # kk-->ッk
          result << "ッ"
          buffer.shift
        else
          result << buffer.shift
        end
      when 3
        triple = buffer.join
        if Rom2KataH3.key?(triple)
          result << Rom2KataH3[triple]
          buffer.clear
        elsif buffer[0] == "n"
          result << "ン"
          buffer.shift
        else
          # No match: emit the first letter and retry with the remaining two.
          result << buffer.shift
        end
      end
    end
  end

  # Converts romaji in +str+ to hiragana (via katakana).
  def rom2hira(str)
    kata2hira(rom2kata(str))
  end

  # Replaces each katakana character in +str+ with its hiragana counterpart;
  # characters without an entry in Kata2hiraH are kept as they are.
  def kata2hira(str)
    str.each_char.map { |c| Kata2hiraH.fetch(c, c) }.join
  end

  # Replaces each hiragana character in +str+ with its katakana counterpart;
  # characters without an entry in Hira2kataH are kept as they are.
  def hira2kata(str)
    str.each_char.map { |c| Hira2kataH.fetch(c, c) }.join
  end

  # Returns an array holding +str1+ followed by its katakana and hiragana
  # renderings, skipping any rendering that is empty or duplicates an earlier
  # entry.
  def kana2kana(str1)
    as_kata = hira2kata(str1)
    as_hira = kata2hira(str1)

    result = [str1]
    result << as_kata if as_kata.length > 0 && str1 != as_kata
    result << as_hira if as_hira.length > 0 && as_kata != as_hira && as_hira != str1
    result
  end

  module_function :kana2rom, :rom2kata, :kata2hira, :hira2kata, :rom2hira, :kana2kana
end
=begin
### Uncomment this section to test at command line
# require 'jcode'  # Ruby 1.8 only; the jcode library no longer exists in Ruby 1.9+ and is not needed there
if $0 == __FILE__ then
# sample
str="ひらがな/カタカナ"
printf("ローマ字: %s\n", Kana2rom::kana2rom(str))
printf("平仮名 : %s\n", Kana2rom::kata2hira(str))
printf("片仮名 : %s\n", Kana2rom::hira2kata(str))
str="ro-maji"
printf("片仮名 : %s\n", Kana2rom::rom2kata(str))
printf("平仮名 : %s\n", Kana2rom::rom2hira(str))
end
=end
Leave a Reply