UTF-8 normalization

General FreeBASIC programming questions.
Post Reply
lal0qnsc
Posts: 14
Joined: Apr 05, 2022 10:27

UTF-8 normalization

Post by lal0qnsc »

After reading the link below, I think that once a UTF-8 string has been normalized, the same algorithms used for ASCII strings could be used to implement the functions listed here: https://www.freebasic.net/wiki/CatPgString. Please correct me if I am wrong.

https://stackoverflow.com/questions/793 ... -all-about
lal0qnsc
Posts: 14
Joined: Apr 05, 2022 10:27

Re: UTF-8 normalization

Post by lal0qnsc »

If that is the case, then here you are:

utf8proc.bi

Code: Select all

#pragma once

#include once "crt/stdlib.bi"
#include once "crt/stddef.bi"
#include once "crt/limits.bi"

''utf8proc_decompose renamed to utf8proc_decompose_

extern "C"

'' Version of the utf8proc API this header was written against (2.7.0),
'' split into its major / minor / patch components.
const UTF8PROC_VERSION_MAJOR = 2, _
      UTF8PROC_VERSION_MINOR = 7, _
      UTF8PROC_VERSION_PATCH = 0

'' Integer aliases used by every declaration in this header.
'' In FreeBASIC byte/ubyte are 8-bit, short/ushort 16-bit and long/ulong 32-bit
'' on all targets, while integer/uinteger follow the pointer size (32 or 64 bit).
type utf8proc_int8_t as byte
type utf8proc_uint8_t as ubyte
type utf8proc_int16_t as short
type utf8proc_uint16_t as ushort
type utf8proc_int32_t as long
type utf8proc_uint32_t as ulong
'' Pointer-sized unsigned / signed sizes.
type utf8proc_size_t as uinteger
type utf8proc_ssize_t as integer
'' NOTE(review): assumes the C library's bool is one byte holding 0/1, like
'' FreeBASIC's boolean - confirm for the compiler used to build utf8proc.
type utf8proc_bool as boolean

'' Option flags accepted by the functions that take an "options" argument.
'' Every flag occupies its own bit (bits 0 to 14), so flags can be combined
'' with "or". UTF8PROC_NLF2LF is not a separate bit: it is the combination of
'' the UTF8PROC_NLF2LS and UTF8PROC_NLF2PS bits.
type utf8proc_option_t as long
enum
  UTF8PROC_NULLTERM  = &h0001
  UTF8PROC_STABLE    = &h0002
  UTF8PROC_COMPAT    = &h0004
  UTF8PROC_COMPOSE   = &h0008
  UTF8PROC_DECOMPOSE = &h0010
  UTF8PROC_IGNORE    = &h0020
  UTF8PROC_REJECTNA  = &h0040
  UTF8PROC_NLF2LS    = &h0080
  UTF8PROC_NLF2PS    = &h0100
  UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS or UTF8PROC_NLF2PS
  UTF8PROC_STRIPCC   = &h0200
  UTF8PROC_CASEFOLD  = &h0400
  UTF8PROC_CHARBOUND = &h0800
  UTF8PROC_LUMP      = &h1000
  UTF8PROC_STRIPMARK = &h2000
  UTF8PROC_STRIPNA   = &h4000
end enum

'' Negative status codes of the library.
'' NOTE(review): presumably these are what the functions returning
'' utf8proc_ssize_t report on failure and what utf8proc_errmsg() translates -
'' confirm against the upstream header.
const UTF8PROC_ERROR_NOMEM       = -1, _
      UTF8PROC_ERROR_OVERFLOW    = -2, _
      UTF8PROC_ERROR_INVALIDUTF8 = -3, _
      UTF8PROC_ERROR_NOTASSIGNED = -4, _
      UTF8PROC_ERROR_INVALIDOPTS = -5
'' 16-bit signed type of the first four property fields below.
type utf8proc_propval_t as utf8proc_int16_t

'' Per-code-point property record; utf8proc_get_property() in this header
'' returns a const pointer to one of these.
'' The record mirrors a C struct, so field order, field types and bit-field
'' widths must not be changed.
'' NOTE(review): the layout is tied to the utf8proc version the header was
'' written for (2.7.0 per the version constants) - re-check it when linking
'' against a different library version.
type utf8proc_property_struct
  '' Four 16-bit signed property values.
  category as utf8proc_propval_t
  combining_class as utf8proc_propval_t
  bidi_class as utf8proc_propval_t
  decomp_type as utf8proc_propval_t
  '' Six 16-bit unsigned indices.
  '' NOTE(review): presumably offsets into tables internal to the library - confirm.
  decomp_seqindex as utf8proc_uint16_t
  casefold_seqindex as utf8proc_uint16_t
  uppercase_seqindex as utf8proc_uint16_t
  lowercase_seqindex as utf8proc_uint16_t
  titlecase_seqindex as utf8proc_uint16_t
  comb_index as utf8proc_uint16_t
  '' Bit-fields on a 32-bit unsigned container: 1+1+1+1+2+2+8 = 16 bits used.
  bidi_mirrored : 1 as ulong
  comp_exclusion : 1 as ulong
  ignorable : 1 as ulong
  control_boundary : 1 as ulong
  charwidth : 2 as ulong
  pad : 2 as ulong
  boundclass : 8 as ulong
end type

'' Short alias for the record type.
type utf8proc_property_t as utf8proc_property_struct

'' Category codes 0 to 29; utf8proc_category() in this header returns a
'' utf8proc_category_t.
'' The enum is anonymous and the type is a separate 32-bit alias, so the
'' constants are plain integers and the type name only documents intent.
'' NOTE(review): the two-letter suffixes look like the Unicode General_Category
'' abbreviations (Lu, Ll, Nd, ...) - confirm against the upstream header.
type utf8proc_category_t as long
enum
  UTF8PROC_CATEGORY_CN = 0
  UTF8PROC_CATEGORY_LU = 1
  UTF8PROC_CATEGORY_LL = 2
  UTF8PROC_CATEGORY_LT = 3
  UTF8PROC_CATEGORY_LM = 4
  UTF8PROC_CATEGORY_LO = 5
  UTF8PROC_CATEGORY_MN = 6
  UTF8PROC_CATEGORY_MC = 7
  UTF8PROC_CATEGORY_ME = 8
  UTF8PROC_CATEGORY_ND = 9
  UTF8PROC_CATEGORY_NL = 10
  UTF8PROC_CATEGORY_NO = 11
  UTF8PROC_CATEGORY_PC = 12
  UTF8PROC_CATEGORY_PD = 13
  UTF8PROC_CATEGORY_PS = 14
  UTF8PROC_CATEGORY_PE = 15
  UTF8PROC_CATEGORY_PI = 16
  UTF8PROC_CATEGORY_PF = 17
  UTF8PROC_CATEGORY_PO = 18
  UTF8PROC_CATEGORY_SM = 19
  UTF8PROC_CATEGORY_SC = 20
  UTF8PROC_CATEGORY_SK = 21
  UTF8PROC_CATEGORY_SO = 22
  UTF8PROC_CATEGORY_ZS = 23
  UTF8PROC_CATEGORY_ZL = 24
  UTF8PROC_CATEGORY_ZP = 25
  UTF8PROC_CATEGORY_CC = 26
  UTF8PROC_CATEGORY_CF = 27
  UTF8PROC_CATEGORY_CS = 28
  UTF8PROC_CATEGORY_CO = 29
end enum

'' Bidirectional class codes 1 to 23 (no constant is defined for 0).
'' NOTE(review): presumably the values stored in the bidi_class field of
'' utf8proc_property_struct, with suffixes following the Unicode Bidi_Class
'' abbreviations - confirm against the upstream header.
type utf8proc_bidi_class_t as long
enum
  UTF8PROC_BIDI_CLASS_L = 1
  UTF8PROC_BIDI_CLASS_LRE = 2
  UTF8PROC_BIDI_CLASS_LRO = 3
  UTF8PROC_BIDI_CLASS_R = 4
  UTF8PROC_BIDI_CLASS_AL = 5
  UTF8PROC_BIDI_CLASS_RLE = 6
  UTF8PROC_BIDI_CLASS_RLO = 7
  UTF8PROC_BIDI_CLASS_PDF = 8
  UTF8PROC_BIDI_CLASS_EN = 9
  UTF8PROC_BIDI_CLASS_ES = 10
  UTF8PROC_BIDI_CLASS_ET = 11
  UTF8PROC_BIDI_CLASS_AN = 12
  UTF8PROC_BIDI_CLASS_CS = 13
  UTF8PROC_BIDI_CLASS_NSM = 14
  UTF8PROC_BIDI_CLASS_BN = 15
  UTF8PROC_BIDI_CLASS_B = 16
  UTF8PROC_BIDI_CLASS_S = 17
  UTF8PROC_BIDI_CLASS_WS = 18
  UTF8PROC_BIDI_CLASS_ON = 19
  UTF8PROC_BIDI_CLASS_LRI = 20
  UTF8PROC_BIDI_CLASS_RLI = 21
  UTF8PROC_BIDI_CLASS_FSI = 22
  UTF8PROC_BIDI_CLASS_PDI = 23
end enum

'' Decomposition type codes 1 to 16 (no constant is defined for 0).
'' NOTE(review): presumably the values stored in the decomp_type field of
'' utf8proc_property_struct - confirm against the upstream header.
type utf8proc_decomp_type_t as long
enum
  UTF8PROC_DECOMP_TYPE_FONT = 1
  UTF8PROC_DECOMP_TYPE_NOBREAK = 2
  UTF8PROC_DECOMP_TYPE_INITIAL = 3
  UTF8PROC_DECOMP_TYPE_MEDIAL = 4
  UTF8PROC_DECOMP_TYPE_FINAL = 5
  UTF8PROC_DECOMP_TYPE_ISOLATED = 6
  UTF8PROC_DECOMP_TYPE_CIRCLE = 7
  UTF8PROC_DECOMP_TYPE_SUPER = 8
  UTF8PROC_DECOMP_TYPE_SUB = 9
  UTF8PROC_DECOMP_TYPE_VERTICAL = 10
  UTF8PROC_DECOMP_TYPE_WIDE = 11
  UTF8PROC_DECOMP_TYPE_NARROW = 12
  UTF8PROC_DECOMP_TYPE_SMALL = 13
  UTF8PROC_DECOMP_TYPE_SQUARE = 14
  UTF8PROC_DECOMP_TYPE_FRACTION = 15
  UTF8PROC_DECOMP_TYPE_COMPAT = 16
end enum

'' Boundary class codes 0 to 20; all of them fit the 8-bit boundclass
'' bit-field of utf8proc_property_struct.
'' NOTE(review): presumably the grapheme-cluster boundary classes consulted by
'' utf8proc_grapheme_break() - confirm against the upstream header.
type utf8proc_boundclass_t as long
enum
  UTF8PROC_BOUNDCLASS_START = 0
  UTF8PROC_BOUNDCLASS_OTHER = 1
  UTF8PROC_BOUNDCLASS_CR = 2
  UTF8PROC_BOUNDCLASS_LF = 3
  UTF8PROC_BOUNDCLASS_CONTROL = 4
  UTF8PROC_BOUNDCLASS_EXTEND = 5
  UTF8PROC_BOUNDCLASS_L = 6
  UTF8PROC_BOUNDCLASS_V = 7
  UTF8PROC_BOUNDCLASS_T = 8
  UTF8PROC_BOUNDCLASS_LV = 9
  UTF8PROC_BOUNDCLASS_LVT = 10
  UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11
  UTF8PROC_BOUNDCLASS_SPACINGMARK = 12
  UTF8PROC_BOUNDCLASS_PREPEND = 13
  UTF8PROC_BOUNDCLASS_ZWJ = 14
  UTF8PROC_BOUNDCLASS_E_BASE = 15
  UTF8PROC_BOUNDCLASS_E_MODIFIER = 16
  UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17
  UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18
  UTF8PROC_BOUNDCLASS_EXTENDED_PICTOGRAPHIC = 19
  UTF8PROC_BOUNDCLASS_E_ZWG = 20
end enum

'' Callback type taken by utf8proc_decompose_custom() and utf8proc_map_custom():
'' it receives a code point and an untyped pointer and returns a code point.
'' Both of those functions take a custom_data pointer alongside the callback.
'' NOTE(review): presumably that pointer is passed through unchanged as "data" - confirm.
type utf8proc_custom_func as function(byval codepoint as utf8proc_int32_t, byval data as any ptr) as utf8proc_int32_t

'' Read-only 256-entry table of signed bytes exported by the library, one
'' entry per possible byte value. On Windows the variable is declared with
'' "import" so that it is resolved from the DLL.
'' NOTE(review): presumably maps a UTF-8 lead byte to its sequence length - confirm.
#ifdef __FB_WIN32__
  extern import utf8proc_utf8class(0 to 255) as const utf8proc_int8_t
#else
  extern utf8proc_utf8class(0 to 255) as const utf8proc_int8_t
#endif

'' --- Library information ---
'' The const zstring ptr results are read-only, zero-terminated C strings.
declare function utf8proc_version() as const zstring ptr
declare function utf8proc_unicode_version() as const zstring ptr
'' Takes a utf8proc_ssize_t, the same type the processing functions return.
'' NOTE(review): presumably turns a negative UTF8PROC_ERROR_* code into a message - confirm.
declare function utf8proc_errmsg(byval errcode as utf8proc_ssize_t) as const zstring ptr

'' --- Single code point primitives ---
'' NOTE(review): presumably decodes one code point from str_ (at most strlen
'' bytes) into *codepoint_ref and returns the number of bytes consumed, or a
'' negative error code - confirm against the upstream header.
declare function utf8proc_iterate(byval str_ as const utf8proc_uint8_t ptr, byval strlen as utf8proc_ssize_t, byval codepoint_ref as utf8proc_int32_t ptr) as utf8proc_ssize_t
declare function utf8proc_codepoint_valid(byval codepoint as utf8proc_int32_t) as utf8proc_bool
'' NOTE(review): dst must be large enough for the encoded bytes of one code
'' point (up to 4 in UTF-8); presumably returns the number written - confirm.
declare function utf8proc_encode_char(byval codepoint as utf8proc_int32_t, byval dst as utf8proc_uint8_t ptr) as utf8proc_ssize_t
'' Returns a const pointer to a property record.
'' NOTE(review): presumably points into static library data and must not be freed - confirm.
declare function utf8proc_get_property(byval codepoint as utf8proc_int32_t) as const utf8proc_property_t ptr
'' last_boundclass points to a 32-bit integer (C int).
'' NOTE(review): presumably state the caller keeps between successive calls - confirm.
declare function utf8proc_decompose_char(byval codepoint as utf8proc_int32_t, byval dst as utf8proc_int32_t ptr, byval bufsize as utf8proc_ssize_t, byval options as utf8proc_option_t, byval last_boundclass as long ptr) as utf8proc_ssize_t
'' Binding for the C function utf8proc_decompose().
'' FreeBASIC identifiers are case-insensitive, so the C name would clash with
'' the UTF8PROC_DECOMPOSE option flag; the function is therefore exposed as
'' utf8proc_decompose_. Inside extern "C" the declared identifier is also used
'' as the link-time symbol, so the alias is required to keep linking against
'' the symbol the library really exports ("utf8proc_decompose"); without it
'' every call fails with an undefined reference to "utf8proc_decompose_".
''
'' str_     : input bytes, strlen bytes long
'' buffer   : output array of 32-bit code points, bufsize entries
'' options  : combination of UTF8PROC_* option flags
'' NOTE(review): presumably returns the number of code points produced or a
'' negative UTF8PROC_ERROR_* code - confirm against the upstream header.
declare function utf8proc_decompose_ alias "utf8proc_decompose"(byval str_ as const utf8proc_uint8_t ptr, byval strlen as utf8proc_ssize_t, byval buffer as utf8proc_int32_t ptr, byval bufsize as utf8proc_ssize_t, byval options as utf8proc_option_t) as utf8proc_ssize_t
'' --- Buffer-level processing ---
'' Same parameters as utf8proc_decompose_ plus a callback (custom_func) and a
'' pointer (custom_data).
'' NOTE(review): presumably custom_data is handed to the callback - confirm.
declare function utf8proc_decompose_custom(byval str_ as const utf8proc_uint8_t ptr, byval strlen as utf8proc_ssize_t, byval buffer as utf8proc_int32_t ptr, byval bufsize as utf8proc_ssize_t, byval options as utf8proc_option_t, byval custom_func as utf8proc_custom_func, byval custom_data as any ptr) as utf8proc_ssize_t
'' Both take a writable buffer of 32-bit code points and its length.
'' NOTE(review): presumably both work in place, and utf8proc_reencode overwrites
'' the buffer with UTF-8 bytes - confirm against the upstream header.
declare function utf8proc_normalize_utf32(byval buffer as utf8proc_int32_t ptr, byval length as utf8proc_ssize_t, byval options as utf8proc_option_t) as utf8proc_ssize_t
declare function utf8proc_reencode(byval buffer as utf8proc_int32_t ptr, byval length as utf8proc_ssize_t, byval options as utf8proc_option_t) as utf8proc_ssize_t

'' --- Grapheme boundaries ---
'' The stateful variant additionally takes a pointer to a 32-bit state value.
'' NOTE(review): presumably the state is kept by the caller across consecutive
'' code point pairs and starts at 0 - confirm.
declare function utf8proc_grapheme_break_stateful(byval codepoint1 as utf8proc_int32_t, byval codepoint2 as utf8proc_int32_t, byval state as utf8proc_int32_t ptr) as utf8proc_bool
declare function utf8proc_grapheme_break(byval codepoint1 as utf8proc_int32_t, byval codepoint2 as utf8proc_int32_t) as utf8proc_bool

'' --- Per-code-point case mapping and classification ---
declare function utf8proc_tolower(byval c as utf8proc_int32_t) as utf8proc_int32_t
declare function utf8proc_toupper(byval c as utf8proc_int32_t) as utf8proc_int32_t
declare function utf8proc_totitle(byval c as utf8proc_int32_t) as utf8proc_int32_t
'' These return a 32-bit integer (C int), not utf8proc_bool.
'' NOTE(review): presumably zero/non-zero truth values, so test with "<> 0"
'' rather than comparing against FreeBASIC's true (-1) - confirm.
declare function utf8proc_islower(byval c as utf8proc_int32_t) as long
declare function utf8proc_isupper(byval c as utf8proc_int32_t) as long
declare function utf8proc_charwidth(byval codepoint as utf8proc_int32_t) as long
declare function utf8proc_category(byval codepoint as utf8proc_int32_t) as utf8proc_category_t
declare function utf8proc_category_string(byval codepoint as utf8proc_int32_t) as const zstring ptr

'' --- Whole-string mapping ---
'' dstptr is a pointer to a byte pointer.
'' NOTE(review): presumably receives a newly allocated result that the caller
'' must release with the C runtime's free() - confirm.
declare function utf8proc_map(byval str_ as const utf8proc_uint8_t ptr, byval strlen as utf8proc_ssize_t, byval dstptr as utf8proc_uint8_t ptr ptr, byval options as utf8proc_option_t) as utf8proc_ssize_t
declare function utf8proc_map_custom(byval str_ as const utf8proc_uint8_t ptr, byval strlen as utf8proc_ssize_t, byval dstptr as utf8proc_uint8_t ptr ptr, byval options as utf8proc_option_t, byval custom_func as utf8proc_custom_func, byval custom_data as any ptr) as utf8proc_ssize_t

'' --- Normalization helpers ---
'' Each takes a byte pointer with no length argument and returns a non-const
'' byte pointer.
'' NOTE(review): presumably the input must be zero-terminated, the result is
'' allocated by the library and must be released with free(), and a null
'' pointer signals failure - confirm against the upstream header.
declare function utf8proc_NFD(byval str_ as const utf8proc_uint8_t ptr) as utf8proc_uint8_t ptr
declare function utf8proc_NFC(byval str_ as const utf8proc_uint8_t ptr) as utf8proc_uint8_t ptr
declare function utf8proc_NFKD(byval str_ as const utf8proc_uint8_t ptr) as utf8proc_uint8_t ptr
declare function utf8proc_NFKC(byval str_ as const utf8proc_uint8_t ptr) as utf8proc_uint8_t ptr
declare function utf8proc_NFKC_Casefold(byval str_ as const utf8proc_uint8_t ptr) as utf8proc_uint8_t ptr

end extern
https://github.com/JuliaStrings/utf8pro ... tag/v2.7.0

I hope FreeBASIC will have full UTF-8 support soon.

P.S.: It seems Free Pascal also uses utf8proc (or used to). utf8proc is widely used outside of the Julia language.
Post Reply