HomeUniDoc
...

Package textencoding

Overview ▾

Package textencoding is used for handling text encoding (char code <-> glyph mapping) in unidoc, both for reading and outputting PDF contents.

Index ▾

Constants
func ExpandLigatures(runes []rune) string
func FromFontDifferences(diffList *core.PdfObjectArray) (map[CharCode]GlyphName, error)
func GlyphToRune(glyph GlyphName) (rune, bool)
func RegisterSimpleEncoding(name string, fnc func() SimpleEncoder)
func RuneToString(r rune) string
type CMapEncoder
    func NewCMapEncoder(baseName string, codeToCID, cidToUnicode *cmap.CMap) CMapEncoder
    func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool)
    func (enc CMapEncoder) Decode(raw []byte) string
    func (enc CMapEncoder) Encode(str string) []byte
    func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool)
    func (enc CMapEncoder) String() string
    func (enc CMapEncoder) ToPdfObject() core.PdfObject
type CharCode
type GID
type GlyphName
    func RuneToGlyph(r rune) (GlyphName, bool)
type IdentityEncoder
    func NewIdentityTextEncoder(baseName string) *IdentityEncoder
    func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool)
    func (enc *IdentityEncoder) Decode(raw []byte) string
    func (enc *IdentityEncoder) Encode(str string) []byte
    func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool)
    func (enc *IdentityEncoder) RegisteredRunes() []rune
    func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool)
    func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool)
    func (enc *IdentityEncoder) String() string
    func (enc *IdentityEncoder) ToPdfObject() core.PdfObject
type SimpleEncoder
    func ApplyDifferences(base SimpleEncoder, differences map[CharCode]GlyphName) SimpleEncoder
    func NewCustomSimpleTextEncoder(encoding, differences map[CharCode]GlyphName) (SimpleEncoder, error)
    func NewMacExpertEncoder() SimpleEncoder
    func NewMacRomanEncoder() SimpleEncoder
    func NewPdfDocEncoder() SimpleEncoder
    func NewSimpleTextEncoder(baseName string, differences map[CharCode]GlyphName) (SimpleEncoder, error)
    func NewStandardEncoder() SimpleEncoder
    func NewSymbolEncoder() SimpleEncoder
    func NewWinAnsiEncoder() SimpleEncoder
    func NewZapfDingbatsEncoder() SimpleEncoder
type TextEncoder
type TrueTypeFontEncoder
    func NewTrueTypeFontEncoder(runeToGIDMap map[rune]GID) *TrueTypeFontEncoder
    func (enc *TrueTypeFontEncoder) CharcodeToRune(code CharCode) (rune, bool)
    func (enc *TrueTypeFontEncoder) Decode(raw []byte) string
    func (enc *TrueTypeFontEncoder) Encode(str string) []byte
    func (enc *TrueTypeFontEncoder) GlyphToCharcode(glyph GlyphName) (CharCode, bool)
    func (enc *TrueTypeFontEncoder) RegisteredRunes() []rune
    func (enc *TrueTypeFontEncoder) RuneToCharcode(r rune) (CharCode, bool)
    func (enc *TrueTypeFontEncoder) String() string
    func (enc *TrueTypeFontEncoder) SubsetRegistered()
    func (enc *TrueTypeFontEncoder) ToPdfObject() core.PdfObject
type UTF16Encoder
    func NewUTF16TextEncoder(baseName string) UTF16Encoder
    func (enc UTF16Encoder) CharcodeToRune(code CharCode) (rune, bool)
    func (enc UTF16Encoder) Decode(raw []byte) string
    func (enc UTF16Encoder) Encode(str string) []byte
    func (enc UTF16Encoder) RuneToCharcode(r rune) (CharCode, bool)
    func (enc UTF16Encoder) String() string
    func (enc UTF16Encoder) ToPdfObject() core.PdfObject

Package files

charglyph_map.go cmap.go differences.go doc.go encoder.go glyphs_glyphlist.go identity.go simple.go simple_encoders_map.go simple_mac.go simple_other.go simple_pdf.go simple_winansi.go truetype.go utf16.go utils.go

Constants

MissingCodeRune is the rune returned when there is no matching glyph. It was previously '?'.

const (
    // MissingCodeRune replaces runes that can't be decoded.
    MissingCodeRune = '\ufffd' // �

    // MissingCodeString replaces strings that can't be decoded.
    MissingCodeString = string(MissingCodeRune)
)

func ExpandLigatures

func ExpandLigatures(runes []rune) string

ExpandLigatures returns `runes` as a string with ligatures expanded.

func FromFontDifferences

func FromFontDifferences(diffList *core.PdfObjectArray) (map[CharCode]GlyphName, error)

FromFontDifferences converts `diffList` (a /Differences array from an /Encoding object) to a map representing character code to glyph mappings.

func GlyphToRune

func GlyphToRune(glyph GlyphName) (rune, bool)

GlyphToRune returns the rune corresponding to glyph `glyph` if there is one.

TODO: Can we return a string here? E.g. when we are extracting text, we want to get "ffi" rather than 'ﬃ'. We only need a glyph ➞ rune map when we need to convert back to glyphs. We are currently applying RuneToString to the output of functions that call GlyphToRune. While this gives the same result, it makes the calling code complex and fragile.

TODO: Can we combine all the tables glyphAliases, glyphlistGlyphToRuneMap, texGlyphlistGlyphToStringMap, additionalGlyphlistGlyphToRuneMap and ".notdef"?

func RegisterSimpleEncoding

func RegisterSimpleEncoding(name string, fnc func() SimpleEncoder)

RegisterSimpleEncoding registers a SimpleEncoder constructor by PDF encoding name.

func RuneToString

func RuneToString(r rune) string

RuneToString converts rune `r` to a string. It unpacks `ligatures`.

type CMapEncoder

CMapEncoder encodes/decodes strings based on CMap mappings.

type CMapEncoder struct {
    // contains filtered or unexported fields
}

func NewCMapEncoder

func NewCMapEncoder(baseName string, codeToCID, cidToUnicode *cmap.CMap) CMapEncoder

NewCMapEncoder returns a new CMapEncoder based on the predefined encoding `baseName`. If `codeToCID` is nil, Identity encoding is assumed. `cidToUnicode` must not be nil.

func (CMapEncoder) CharcodeToRune

func (enc CMapEncoder) CharcodeToRune(code CharCode) (rune, bool)

CharcodeToRune converts PDF character code `code` to a rune. The bool return flag is true if there was a match, and false otherwise.

func (CMapEncoder) Decode

func (enc CMapEncoder) Decode(raw []byte) string

Decode converts PDF encoded string to a Go unicode string.

func (CMapEncoder) Encode

func (enc CMapEncoder) Encode(str string) []byte

Encode converts the Go unicode string to a PDF encoded string.

func (CMapEncoder) RuneToCharcode

func (enc CMapEncoder) RuneToCharcode(r rune) (CharCode, bool)

RuneToCharcode converts rune `r` to a PDF character code. The bool return flag is true if there was a match, and false otherwise.

func (CMapEncoder) String

func (enc CMapEncoder) String() string

String returns a string that describes `enc`.

func (CMapEncoder) ToPdfObject

func (enc CMapEncoder) ToPdfObject() core.PdfObject

ToPdfObject returns a PDF Object that represents the encoding.

type CharCode

CharCode is a character code used in the specific encoding.

type CharCode uint16

type GID

GID is a glyph index.

type GID uint16

type GlyphName

GlyphName is a name of a glyph.

type GlyphName string

func RuneToGlyph

func RuneToGlyph(r rune) (GlyphName, bool)

RuneToGlyph is the reverse of the table lookups in GlyphToRune.

type IdentityEncoder

IdentityEncoder represents a 2-byte identity encoding.

NOTE: In many cases this is just used to encode/decode to a glyph index and does not have a unicode meaning, except via the ToUnicode maps.

TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not ideal and is confusing. It might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual runes and character codes, CMaps together.

type IdentityEncoder struct {
    // contains filtered or unexported fields
}

func NewIdentityTextEncoder

func NewIdentityTextEncoder(baseName string) *IdentityEncoder

NewIdentityTextEncoder returns a new IdentityEncoder based on the predefined encoding `baseName`.

func (*IdentityEncoder) CharcodeToRune

func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool)

CharcodeToRune converts PDF character code `code` to a rune. The bool return flag is true if there was a match, and false otherwise. TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).

func (*IdentityEncoder) Decode

func (enc *IdentityEncoder) Decode(raw []byte) string

Decode converts PDF encoded string to a Go unicode string.

func (*IdentityEncoder) Encode

func (enc *IdentityEncoder) Encode(str string) []byte

Encode converts the Go unicode string to a PDF encoded string.

func (*IdentityEncoder) GlyphToRune

func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool)

GlyphToRune returns the rune corresponding to glyph name `glyph`. The bool return flag is true if there was a match, and false otherwise.

func (*IdentityEncoder) RegisteredRunes

func (enc *IdentityEncoder) RegisteredRunes() []rune

RegisteredRunes returns the slice of runes that have been registered as used by the encoder.

func (*IdentityEncoder) RuneToCharcode

func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool)

RuneToCharcode converts rune `r` to a PDF character code. The bool return flag is true if there was a match, and false otherwise. TODO: Here the `r` is an actual rune.

func (*IdentityEncoder) RuneToGlyph

func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool)

RuneToGlyph returns the glyph name for rune `r`. The bool return flag is true if there was a match, and false otherwise.

func (*IdentityEncoder) String

func (enc *IdentityEncoder) String() string

String returns a string that describes `enc`.

func (*IdentityEncoder) ToPdfObject

func (enc *IdentityEncoder) ToPdfObject() core.PdfObject

ToPdfObject returns nil, as the encoder is not truly a PDF object and should not be stored in a file.

type SimpleEncoder

SimpleEncoder represents a 1 byte encoding.

type SimpleEncoder interface {
    TextEncoder
    BaseName() string
    Charcodes() []CharCode
}

func ApplyDifferences

func ApplyDifferences(base SimpleEncoder, differences map[CharCode]GlyphName) SimpleEncoder

ApplyDifferences modifies or wraps the base encoding and overlays differences over it.

func NewCustomSimpleTextEncoder

func NewCustomSimpleTextEncoder(encoding, differences map[CharCode]GlyphName) (SimpleEncoder, error)

NewCustomSimpleTextEncoder returns a SimpleEncoder based on map `encoding` and difference map `differences`.

func NewMacExpertEncoder

func NewMacExpertEncoder() SimpleEncoder

NewMacExpertEncoder returns a SimpleEncoder that implements MacExpertEncoding.

func NewMacRomanEncoder

func NewMacRomanEncoder() SimpleEncoder

NewMacRomanEncoder returns a SimpleEncoder that implements MacRomanEncoding.

func NewPdfDocEncoder

func NewPdfDocEncoder() SimpleEncoder

NewPdfDocEncoder returns a SimpleEncoder that implements PdfDocEncoding.

func NewSimpleTextEncoder

func NewSimpleTextEncoder(baseName string, differences map[CharCode]GlyphName) (SimpleEncoder, error)

NewSimpleTextEncoder returns a SimpleEncoder based on predefined encoding `baseName` and difference map `differences`.

func NewStandardEncoder

func NewStandardEncoder() SimpleEncoder

NewStandardEncoder returns a SimpleEncoder that implements StandardEncoding.

func NewSymbolEncoder

func NewSymbolEncoder() SimpleEncoder

NewSymbolEncoder returns a SimpleEncoder that implements SymbolEncoding.

func NewWinAnsiEncoder

func NewWinAnsiEncoder() SimpleEncoder

NewWinAnsiEncoder returns a SimpleEncoder that implements WinAnsiEncoding.

func NewZapfDingbatsEncoder

func NewZapfDingbatsEncoder() SimpleEncoder

NewZapfDingbatsEncoder returns a SimpleEncoder that implements ZapfDingbatsEncoding.

type TextEncoder

TextEncoder defines the common methods that a text encoder implementation must have in UniDoc.

type TextEncoder interface {
    // String returns a string that describes the TextEncoder instance.
    String() string

    // Encode converts the Go unicode string to a PDF encoded string.
    Encode(str string) []byte

    // Decode converts PDF encoded string to a Go unicode string.
    Decode(raw []byte) string

    // RuneToCharcode returns the PDF character code corresponding to rune `r`.
    // The bool return flag is true if there was a match, and false otherwise.
    // This is usually implemented as RuneToGlyph->GlyphToCharcode
    RuneToCharcode(r rune) (CharCode, bool)

    // CharcodeToRune returns the rune corresponding to character code `code`.
    // The bool return flag is true if there was a match, and false otherwise.
    // This is usually implemented as CharcodeToGlyph->GlyphToRune
    CharcodeToRune(code CharCode) (rune, bool)

    // ToPdfObject returns a PDF Object that represents the encoding.
    ToPdfObject() core.PdfObject
}

type TrueTypeFontEncoder

TrueTypeFontEncoder handles text encoding for composite TrueType fonts. It performs mapping between character ids and glyph ids. It has a preloaded rune (unicode code point) to glyph index map that has been loaded from a font. Corresponds to Identity-H CMap and Identity encoding.

type TrueTypeFontEncoder struct {
    // contains filtered or unexported fields
}

func NewTrueTypeFontEncoder

func NewTrueTypeFontEncoder(runeToGIDMap map[rune]GID) *TrueTypeFontEncoder

NewTrueTypeFontEncoder creates a new text encoder for TTF fonts with a rune to glyph index map `runeToGIDMap` that has been preloaded from the font file. The new instance is preloaded with a CMapIdentityH (Identity-H) CMap which maps 2-byte charcodes to CIDs (glyph index).

func (*TrueTypeFontEncoder) CharcodeToRune

func (enc *TrueTypeFontEncoder) CharcodeToRune(code CharCode) (rune, bool)

CharcodeToRune converts PDF character code `code` to a rune. The bool return flag is true if there was a match, and false otherwise.

func (*TrueTypeFontEncoder) Decode

func (enc *TrueTypeFontEncoder) Decode(raw []byte) string

Decode converts PDF encoded string to a Go unicode string.

func (*TrueTypeFontEncoder) Encode

func (enc *TrueTypeFontEncoder) Encode(str string) []byte

Encode converts the Go unicode string to a PDF encoded string.

func (*TrueTypeFontEncoder) GlyphToCharcode

func (enc *TrueTypeFontEncoder) GlyphToCharcode(glyph GlyphName) (CharCode, bool)

GlyphToCharcode returns character code matching the glyph name `glyph`. The bool return flag is true if there was a match, and false otherwise.

func (*TrueTypeFontEncoder) RegisteredRunes

func (enc *TrueTypeFontEncoder) RegisteredRunes() []rune

RegisteredRunes returns the slice of runes that have been registered as used by the encoder.

func (*TrueTypeFontEncoder) RuneToCharcode

func (enc *TrueTypeFontEncoder) RuneToCharcode(r rune) (CharCode, bool)

RuneToCharcode converts rune `r` to a PDF character code. The bool return flag is true if there was a match, and false otherwise.

func (*TrueTypeFontEncoder) String

func (enc *TrueTypeFontEncoder) String() string

String returns a string that describes `enc`.

func (*TrueTypeFontEncoder) SubsetRegistered

func (enc *TrueTypeFontEncoder) SubsetRegistered()

SubsetRegistered subsets `enc` to only registered runes (that have been registered via encoding). NOTE: Make sure to call this shortly before writing (once all needed runes have been registered).

func (*TrueTypeFontEncoder) ToPdfObject

func (enc *TrueTypeFontEncoder) ToPdfObject() core.PdfObject

ToPdfObject returns nil, as the encoder is not truly a PDF object and should not be stored in a file.

type UTF16Encoder

UTF16Encoder represents UTF-16 encoding.

type UTF16Encoder struct {
    // contains filtered or unexported fields
}

func NewUTF16TextEncoder

func NewUTF16TextEncoder(baseName string) UTF16Encoder

NewUTF16TextEncoder returns a new UTF16Encoder based on the predefined encoding `baseName`.

func (UTF16Encoder) CharcodeToRune

func (enc UTF16Encoder) CharcodeToRune(code CharCode) (rune, bool)

CharcodeToRune converts PDF character code `code` to a rune. The bool return flag is true if there was a match, and false otherwise.

func (UTF16Encoder) Decode

func (enc UTF16Encoder) Decode(raw []byte) string

Decode converts PDF encoded string to a Go unicode string.

func (UTF16Encoder) Encode

func (enc UTF16Encoder) Encode(str string) []byte

Encode converts the Go unicode string to a PDF encoded string.

func (UTF16Encoder) RuneToCharcode

func (enc UTF16Encoder) RuneToCharcode(r rune) (CharCode, bool)

RuneToCharcode converts rune `r` to a PDF character code. The bool return flag is true if there was a match, and false otherwise.

func (UTF16Encoder) String

func (enc UTF16Encoder) String() string

String returns a string that describes `enc`.

func (UTF16Encoder) ToPdfObject

func (enc UTF16Encoder) ToPdfObject() core.PdfObject

ToPdfObject returns a PDF Object that represents the encoding.

Subdirectories

Name Synopsis
..