Занимался одной программой и обнаружил путаницу с кодами символов в разных кодировках. Кириллическая буква А в кодировке Windows-1251 имеет код 192 ($C0; код 128 у неё в кодировке CP866), в Unicode — $410, а в UTF-8 это пара байт $D0 $90. Интересно, что в некоторых таблицах код UTF-8 показан как одно шестнадцатеричное число $D090, что не совсем верно: это последовательность из двух отдельных байт. Получилась пара процедур для преобразования кодов:
; Prints (via Debug) the UTF-8 encoding of the Unicode code point x as
; space-separated hexadecimal byte values, e.g. $1F4B0 -> "F0 9F 92 B0".
;
; x : code point in the range 0 .. $10FFFF. For any other value no Case
;     matches and an empty string is printed.
;
; Every sub-expression is parenthesised explicitly so the result does not
; depend on operator-priority rules.
Procedure un2utf(x)
  Protected y.s

  Select x
    Case 0 To $7F
      ; 1 byte: 0xxxxxxx
      y = Hex(x, #PB_Ascii)

    Case $80 To $7FF
      ; 2 bytes: 110xxxxx 10yyyyyy
      y = Hex($C0 + (x >> 6), #PB_Ascii) + " " + Hex($80 + (x & $3F), #PB_Ascii)

    Case $800 To $FFFF
      ; 3 bytes: 1110xxxx 10yyyyyy 10zzzzzz
      y = Hex($E0 + (x >> 12), #PB_Ascii) + " " + Hex($80 + ((x >> 6) & $3F), #PB_Ascii) + " " + Hex($80 + (x & $3F), #PB_Ascii)

    Case $10000 To $10FFFF
      ; 4 bytes: 11110xxx 10yyyyyy 10zzzzzz 10wwwwww
      ; The third byte must keep the six bits 6..11 (mask $3F = 63); the
      ; previous mask of 64 kept a single bit and produced a wrong byte.
      y = Hex($F0 + (x >> 18), #PB_Ascii) + " " + Hex($80 + ((x >> 12) & $3F), #PB_Ascii) + " " + Hex($80 + ((x >> 6) & $3F), #PB_Ascii) + " " + Hex($80 + (x & $3F), #PB_Ascii)
  EndSelect

  Debug y
EndProcedure
; Demo calls for un2utf; each call prints one line to the debug output.
; Author's reference values: code point
;1f4b0
; is expected to be printed as the bytes
;f0 9f 92 b0
un2utf($1f4b0)
; Encode the character code that Asc() reports for the Cyrillic capital letter A.
un2utf(Asc("А"))
; NOTE(review): this span repeats the body of un2utf without its
; "Procedure un2utf(x)" header line; it is kept in the same headerless shape.
; It builds the UTF-8 bytes of code point x as space-separated hex values
; and prints them with Debug.
Select x
  Case 0 To $7F
    ; 1 byte: 0xxxxxxx
    y.s = Hex(x, #PB_Ascii)
  Case $80 To $7FF
    ; 2 bytes: 110xxxxx 10yyyyyy
    y = Hex(192 + Int(x / 64), #PB_Ascii) + " " + Hex(128 + (x & 63), #PB_Ascii)
  Case $800 To $FFFF
    ; 3 bytes: 1110xxxx 10yyyyyy 10zzzzzz
    r = Int(x / 64)
    y = Hex(224 + Int(r / 64), #PB_Ascii) + " " + Hex(128 + (r & 63), #PB_Ascii) + " " + Hex(128 + (x & 63), #PB_Ascii)
  Case $10000 To $10FFFF
    ; 4 bytes: 11110xxx 10yyyyyy 10zzzzzz 10wwwwww
    r = Int(x / 4096)
    ; The third byte needs the six-bit mask 63; the former mask 64 kept only
    ; one bit, so $1F4B0 came out as F0 9F C0 B0 instead of F0 9F 92 B0.
    y = Hex(240 + Int(r / 64), #PB_Ascii) + " " + Hex(128 + (r & 63), #PB_Ascii) + " " + Hex(128 + (Int(x / 64) & 63), #PB_Ascii) + " " + Hex(128 + (x & 63), #PB_Ascii)
EndSelect
Debug y
EndProcedure
; Demo calls for un2utf; each call prints one line to the debug output.
; Author's reference values: code point
;1f4b0
; is expected to be printed as the bytes
;f0 9f 92 b0
un2utf($1f4b0)
; Encode the character code that Asc() reports for the Cyrillic capital letter A.
un2utf(Asc("А"))
Декодирование UTF-8:
; Returns the string for code point v.
; Values below $10000 are passed straight to Chr(); larger values are split
; into a UTF-16 surrogate pair (lead unit followed by trail unit).
Procedure.s Chr_(v.i)
  Protected lead, trail, offset

  If v >= $10000
    ; Supplementary plane: the pair encodes the 20-bit offset above $10000.
    offset = v - $10000
    lead  = $D800 + (offset / $400)   ; upper 10 bits -> lead (high) surrogate
    trail = $DC00 + (offset % $400)   ; lower 10 bits -> trail (low) surrogate
    ProcedureReturn Chr(lead) + Chr(trail)
  EndIf

  ; Everything else fits into one character unit.
  ProcedureReturn Chr(v)
EndProcedure
; Decodes the NUL-terminated UTF-8 byte sequence at *s and returns it as a
; string. Code points above $FFFF are emitted through Chr_() as a surrogate
; pair.
;
; *s      : address of the first byte; decoding stops at the first zero byte.
; Returns : the decoded text.
;
; Malformed input (a stray continuation byte, an invalid lead byte, or a
; sequence cut short) logs "err decoding", yields U+FFFD for the offending
; byte and decoding resumes at the next byte, instead of ending the program.
; Continuation bytes are read only after the preceding byte has been
; validated, so nothing past the terminating zero is ever read.
; Overlong forms and encoded surrogates are not rejected.
; The built-in PeekS(*s, -1, #PB_UTF8) performs the same conversion.
Procedure.s DecodeUTF8(*s)
  Protected i = 0, n, co
  Protected r$ = ""
  Protected a.a, b.a, c.a, d.a

  ;https://writings.sh/post/en/utf8
  ; bytes | utf8 sequence  | codepoint number
  ; ------+----------------+-------------------------------
  ; 1     | 1st. 0xxxxxxx  | 0xxxxxxx
  ; ------+----------------+-------------------------------
  ; 2     | 1st. 110xxxxx  | 0xxx xxyyyyyy
  ;       | 2nd. 10yyyyyy  |
  ; ------+----------------+-------------------------------
  ; 3     | 1st. 1110xxxx  | xxxxyyyy yyzzzzzz
  ;       | 2nd. 10yyyyyy  |
  ;       | 3rd. 10zzzzzz  |
  ; ------+----------------+-------------------------------
  ; 4     | 1st. 11110xxx  | 000xxxyy yyyyzzzz zzwwwwww
  ;       | 2nd. 10yyyyyy  |
  ;       | 3rd. 10zzzzzz  |
  ;       | 4th. 10wwwwww  |
  ;https://rosettagit.org/drafts/utf-8-encode-and-decode/
  While PeekA(*s + i)
    a = PeekA(*s + i)
    n = 0 ; number of bytes consumed by a valid sequence; 0 = malformed

    If (a & $80) = 0
      ; 1 byte
      co = a
      n = 1

    ElseIf (a & $E0) = $C0
      ; 2 bytes
      b = PeekA(*s + i + 1)
      If (b & $C0) = $80
        co = ((a & $1F) << 6) | (b & $3F)
        n = 2
      EndIf

    ElseIf (a & $F0) = $E0
      ; 3 bytes, e.g. the euro sign is $20AC = 8364
      b = PeekA(*s + i + 1)
      If (b & $C0) = $80
        c = PeekA(*s + i + 2)
        If (c & $C0) = $80
          co = ((a & $0F) << 12) | ((b & $3F) << 6) | (c & $3F)
          n = 3
        EndIf
      EndIf

    ElseIf (a & $F8) = $F0
      ; 4 bytes, e.g. $1F4B0 = 128176
      b = PeekA(*s + i + 1)
      If (b & $C0) = $80
        c = PeekA(*s + i + 2)
        If (c & $C0) = $80
          d = PeekA(*s + i + 3)
          If (d & $C0) = $80
            co = ((a & $07) << 18) | ((b & $3F) << 12) | ((c & $3F) << 6) | (d & $3F)
            n = 4
          EndIf
        EndIf
      EndIf
    EndIf

    If n
      r$ = r$ + Chr_(co) ; Chr_ is a plain Chr() for values below $10000
      i + n
    Else
      Debug "err decoding"
      r$ = r$ + Chr($FFFD) ; replacement character for the bad byte
      i + 1
    EndIf
  Wend

  ProcedureReturn r$
EndProcedure
; Decode the UTF-8 text embedded below and print the returned string.
Debug DecodeUTF8(?ex1)

DataSection
  ex1:
  IncludeBinary "tx_.txt" ; raw bytes of the file, embedded at compile time
  Data.u 0                ; two zero bytes: the terminator the decoder's loop stops at
EndDataSection
; NOTE(review): this span repeats the body of Chr_ without its "Procedure.s Chr_(v.i)"
; header line. It returns Chr(v) for v below $10000 and otherwise builds a UTF-16
; surrogate pair from v.
Protected high, low
If v < $10000
; value fits into a single character unit
ProcedureReturn Chr(v)
Else
;calculate surrogate pair of unicode codepoints to represent value in UTF-16
; shorthand for v = v - $10000
v - $10000
high = v / $400 + $D800 ;high/lead surrogate value
low = v % $400 + $DC00 ;low/tail surrogate value
ProcedureReturn Chr(high) + Chr(low)
EndIf
EndProcedure
; Decodes the NUL-terminated UTF-8 byte sequence at *s and returns it as a
; string. Code points above $FFFF are emitted through Chr_() as a surrogate
; pair.
;
; *s      : address of the first byte; decoding stops at the first zero byte.
; Returns : the decoded text.
;
; Malformed input (a stray continuation byte, an invalid lead byte, or a
; sequence cut short) logs "err decoding", yields U+FFFD for the offending
; byte and decoding resumes at the next byte, instead of ending the program.
; Continuation bytes are read only after the preceding byte has been
; validated, so nothing past the terminating zero is ever read.
; Overlong forms and encoded surrogates are not rejected.
; The built-in PeekS(*s, -1, #PB_UTF8) performs the same conversion.
Procedure.s DecodeUTF8(*s)
  Protected i = 0, n, co
  Protected r$ = ""
  Protected a.a, b.a, c.a, d.a

  ;https://writings.sh/post/en/utf8
  ; bytes | utf8 sequence  | codepoint number
  ; ------+----------------+-------------------------------
  ; 1     | 1st. 0xxxxxxx  | 0xxxxxxx
  ; ------+----------------+-------------------------------
  ; 2     | 1st. 110xxxxx  | 0xxx xxyyyyyy
  ;       | 2nd. 10yyyyyy  |
  ; ------+----------------+-------------------------------
  ; 3     | 1st. 1110xxxx  | xxxxyyyy yyzzzzzz
  ;       | 2nd. 10yyyyyy  |
  ;       | 3rd. 10zzzzzz  |
  ; ------+----------------+-------------------------------
  ; 4     | 1st. 11110xxx  | 000xxxyy yyyyzzzz zzwwwwww
  ;       | 2nd. 10yyyyyy  |
  ;       | 3rd. 10zzzzzz  |
  ;       | 4th. 10wwwwww  |
  ;https://rosettagit.org/drafts/utf-8-encode-and-decode/
  While PeekA(*s + i)
    a = PeekA(*s + i)
    n = 0 ; number of bytes consumed by a valid sequence; 0 = malformed

    If (a & $80) = 0
      ; 1 byte
      co = a
      n = 1

    ElseIf (a & $E0) = $C0
      ; 2 bytes
      b = PeekA(*s + i + 1)
      If (b & $C0) = $80
        co = ((a & $1F) << 6) | (b & $3F)
        n = 2
      EndIf

    ElseIf (a & $F0) = $E0
      ; 3 bytes, e.g. the euro sign is $20AC = 8364
      b = PeekA(*s + i + 1)
      If (b & $C0) = $80
        c = PeekA(*s + i + 2)
        If (c & $C0) = $80
          co = ((a & $0F) << 12) | ((b & $3F) << 6) | (c & $3F)
          n = 3
        EndIf
      EndIf

    ElseIf (a & $F8) = $F0
      ; 4 bytes, e.g. $1F4B0 = 128176
      b = PeekA(*s + i + 1)
      If (b & $C0) = $80
        c = PeekA(*s + i + 2)
        If (c & $C0) = $80
          d = PeekA(*s + i + 3)
          If (d & $C0) = $80
            co = ((a & $07) << 18) | ((b & $3F) << 12) | ((c & $3F) << 6) | (d & $3F)
            n = 4
          EndIf
        EndIf
      EndIf
    EndIf

    If n
      r$ = r$ + Chr_(co) ; Chr_ is a plain Chr() for values below $10000
      i + n
    Else
      Debug "err decoding"
      r$ = r$ + Chr($FFFD) ; replacement character for the bad byte
      i + 1
    EndIf
  Wend

  ProcedureReturn r$
EndProcedure
; Decode the UTF-8 text embedded below and print the returned string.
Debug DecodeUTF8(?ex1)

DataSection
  ex1:
  IncludeBinary "tx_.txt" ; raw bytes of the file, embedded at compile time
  Data.u 0                ; two zero bytes: the terminator the decoder's loop stops at
EndDataSection
Комментарии
Отправить комментарий