NeoMutt  2024-04-25-76-g20fe7b
Teaching an old dog new tricks
DOXYGEN
Loading...
Searching...
No Matches
charset.h File Reference

Conversion between different character encodings. More...

#include <iconv.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <wchar.h>
+ Include dependency graph for charset.h:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Data Structures

struct  FgetConv
 Cursor for converting a file's encoding. More...
 
struct  FgetConvNot
 A dummy converter. More...
 

Macros

#define MUTT_ICONV_NO_FLAGS   0
 No flags are set.
 
#define MUTT_ICONV_HOOK_FROM   1
 apply charset-hooks to fromcode
 
#define mutt_ch_is_utf8(str)   mutt_ch_chscmp(str, "utf-8")
 
#define mutt_ch_is_us_ascii(str)   mutt_ch_chscmp(str, "us-ascii")
 
#define ICONV_T_INVALID   ((iconv_t) -1)
 Error value for iconv functions.
 
#define ICONV_ILLEGAL_SEQ   ((size_t) -1)
 Error value for iconv() - Illegal sequence.
 
#define ICONV_BUF_TOO_SMALL   ((size_t) -2)
 Error value for iconv() - Buffer too small.
 

Enumerations

enum  LookupType { MUTT_LOOKUP_CHARSET , MUTT_LOOKUP_ICONV }
 Types of character set lookups. More...
 

Functions

void mutt_ch_canonical_charset (char *buf, size_t buflen, const char *name)
 Canonicalise the charset of a string.
 
const char * mutt_ch_charset_lookup (const char *chs)
 Look for a replacement character set.
 
int mutt_ch_check (const char *s, size_t slen, const char *from, const char *to)
 Check whether a string can be converted between encodings.
 
bool mutt_ch_check_charset (const char *cs, bool strict)
 Does iconv understand a character set?
 
char * mutt_ch_choose (const char *fromcode, const struct Slist *charsets, const char *u, size_t ulen, char **d, size_t *dlen)
 Figure the best charset to encode a string.
 
bool mutt_ch_chscmp (const char *cs1, const char *cs2)
 Are the names of two character sets equivalent?
 
int mutt_ch_convert_nonmime_string (const struct Slist *const assumed_charset, const char *charset, char **ps)
 Try to convert a string using a list of character sets.
 
int mutt_ch_convert_string (char **ps, const char *from, const char *to, uint8_t flags)
 Convert a string between encodings.
 
int mutt_ch_fgetconv (struct FgetConv *fc)
 Convert a file's character set.
 
void mutt_ch_fgetconv_close (struct FgetConv **ptr)
 Close an fgetconv handle.
 
struct FgetConv * mutt_ch_fgetconv_open (FILE *fp, const char *from, const char *to, uint8_t flags)
 Prepare a file for charset conversion.
 
char * mutt_ch_fgetconvs (char *buf, size_t buflen, struct FgetConv *fc)
 Convert a file's charset into a string buffer.
 
const char * mutt_ch_get_default_charset (const struct Slist *const assumed_charset)
 Get the default character set.
 
char * mutt_ch_get_langinfo_charset (void)
 Get the user's choice of character set.
 
size_t mutt_ch_iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
 Change the encoding of a string.
 
const char * mutt_ch_iconv_lookup (const char *chs)
 Look for a replacement character set.
 
iconv_t mutt_ch_iconv_open (const char *tocode, const char *fromcode, uint8_t flags)
 Set up iconv for conversions.
 
bool mutt_ch_lookup_add (enum LookupType type, const char *pat, const char *replace, struct Buffer *err)
 Add a new character set lookup.
 
void mutt_ch_lookup_remove (void)
 Remove all the character set lookups.
 
void mutt_ch_set_charset (const char *charset)
 Update the records for a new character set.
 
void mutt_ch_cache_cleanup (void)
 Clean up the cached iconv handles and charset strings.
 
static bool iconv_t_valid (const iconv_t cd)
 Is the conversion descriptor valid?
 

Variables

bool CharsetIsUtf8
 Is the user's current character set utf-8?
 
wchar_t ReplacementChar
 When a Unicode character can't be displayed, use this instead.
 

Detailed Description

Conversion between different character encodings.

Authors
  • Richard Russon
  • Pietro Cerutti

This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/.

Definition in file charset.h.

Macro Definition Documentation

◆ MUTT_ICONV_NO_FLAGS

#define MUTT_ICONV_NO_FLAGS   0

No flags are set.

Definition at line 73 of file charset.h.

◆ MUTT_ICONV_HOOK_FROM

#define MUTT_ICONV_HOOK_FROM   1

apply charset-hooks to fromcode

Definition at line 74 of file charset.h.

◆ mutt_ch_is_utf8

#define mutt_ch_is_utf8 (   str)    mutt_ch_chscmp(str, "utf-8")

Definition at line 98 of file charset.h.

◆ mutt_ch_is_us_ascii

#define mutt_ch_is_us_ascii (   str)    mutt_ch_chscmp(str, "us-ascii")

Definition at line 99 of file charset.h.

◆ ICONV_T_INVALID

#define ICONV_T_INVALID   ((iconv_t) -1)

Error value for iconv functions.

Definition at line 102 of file charset.h.

◆ ICONV_ILLEGAL_SEQ

#define ICONV_ILLEGAL_SEQ   ((size_t) -1)

Error value for iconv() - Illegal sequence.

Definition at line 105 of file charset.h.

◆ ICONV_BUF_TOO_SMALL

#define ICONV_BUF_TOO_SMALL   ((size_t) -2)

Error value for iconv() - Buffer too small.

Definition at line 107 of file charset.h.

Enumeration Type Documentation

◆ LookupType

enum LookupType

Types of character set lookups.

Enumerator
MUTT_LOOKUP_CHARSET 

Alias for another character set.

MUTT_LOOKUP_ICONV 

Character set conversion.

Definition at line 67 of file charset.h.

68{
71};
@ MUTT_LOOKUP_ICONV
Character set conversion.
Definition: charset.h:70
@ MUTT_LOOKUP_CHARSET
Alias for another character set.
Definition: charset.h:69

Function Documentation

◆ mutt_ch_canonical_charset()

void mutt_ch_canonical_charset ( char *  buf,
size_t  buflen,
const char *  name 
)

Canonicalise the charset of a string.

Parameters
bufBuffer for canonical character set name
buflenLength of buffer
nameName to be canonicalised

This first splits off any charset extension such as "//TRANSLIT", canonicalizes the charset, and then re-adds the extension.

Definition at line 374 of file charset.c.

375{
376 if (!buf || !name)
377 return;
378
379 char in[1024] = { 0 };
380 char scratch[1024 + 10] = { 0 };
381 struct Buffer *canon = buf_pool_get();
382
383 mutt_str_copy(in, name, sizeof(in));
384 char *ext = strchr(in, '/');
385 if (ext)
386 *ext++ = '\0';
387
388 if (mutt_istr_equal(in, "utf-8") || mutt_istr_equal(in, "utf8"))
389 {
390 buf_strcpy(canon, "utf-8");
391 goto out;
392 }
393
394 /* catch some common iso-8859-something misspellings */
395 size_t plen;
396 if ((plen = mutt_istr_startswith(in, "8859")) && (in[plen] != '-'))
397 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
398 else if ((plen = mutt_istr_startswith(in, "8859-")))
399 snprintf(scratch, sizeof(scratch), "iso-8859-%s", in + plen);
400 else if ((plen = mutt_istr_startswith(in, "iso8859")) && (in[plen] != '-'))
401 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
402 else if ((plen = mutt_istr_startswith(in, "iso8859-")))
403 snprintf(scratch, sizeof(scratch), "iso_8859-%s", in + plen);
404 else
405 mutt_str_copy(scratch, in, sizeof(scratch));
406
407 for (size_t i = 0; PreferredMimeNames[i].key; i++)
408 {
409 if (mutt_istr_equal(scratch, PreferredMimeNames[i].key))
410 {
411 buf_strcpy(canon, PreferredMimeNames[i].pref);
412 goto out;
413 }
414 }
415
416 buf_strcpy(canon, scratch);
417 buf_lower(canon); // for cosmetics' sake
418
419out:
420 if (ext && (*ext != '\0'))
421 {
422 buf_addch(canon, '/');
423 buf_addstr(canon, ext);
424 }
425
426 mutt_str_copy(buf, buf_string(canon), buflen);
427 buf_pool_release(&canon);
428}
size_t buf_addch(struct Buffer *buf, char c)
Add a single character to a Buffer.
Definition: buffer.c:241
size_t buf_addstr(struct Buffer *buf, const char *s)
Add a string to a Buffer.
Definition: buffer.c:226
size_t buf_strcpy(struct Buffer *buf, const char *s)
Copy a string into a Buffer.
Definition: buffer.c:395
void buf_lower(struct Buffer *buf)
Sets a buffer to lowercase.
Definition: buffer.c:736
static const char * buf_string(const struct Buffer *buf)
Convert a buffer to a const char * "string".
Definition: buffer.h:96
static const struct MimeNames PreferredMimeNames[]
Lookup table of preferred charsets.
Definition: charset.c:121
bool mutt_istr_equal(const char *a, const char *b)
Compare two strings, ignoring case.
Definition: string.c:672
size_t mutt_str_copy(char *dest, const char *src, size_t dsize)
Copy a string into a buffer (guaranteeing NUL-termination)
Definition: string.c:581
size_t mutt_istr_startswith(const char *str, const char *prefix)
Check whether a string starts with a prefix, ignoring case.
Definition: string.c:242
struct Buffer * buf_pool_get(void)
Get a Buffer from the pool.
Definition: pool.c:81
void buf_pool_release(struct Buffer **ptr)
Return a Buffer to the pool.
Definition: pool.c:94
String manipulation buffer.
Definition: buffer.h:36
const char * key
Definition: charset.c:107
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_charset_lookup()

const char * mutt_ch_charset_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if a 'charset-hook' matches)
NULLNo matching hook

Look through all the 'charset-hook's. If one matches return the replacement character set.

Definition at line 562 of file charset.c.

563{
565}
static const char * lookup_charset(enum LookupType type, const char *cs)
Look for a preferred character set name.
Definition: charset.c:303
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check()

int mutt_ch_check ( const char *  s,
size_t  slen,
const char *  from,
const char *  to 
)

Check whether a string can be converted between encodings.

Parameters
[in]sString to check
[in]slenLength of the string to check
[in]fromCurrent character set
[in]toTarget character set
Return values
0Success
-1Error in iconv_open()
>0Errno as set by iconv()

Definition at line 796 of file charset.c.

797{
798 if (!s || !from || !to)
799 return -1;
800
801 int rc = 0;
802 iconv_t cd = mutt_ch_iconv_open(to, from, MUTT_ICONV_NO_FLAGS);
803 if (!iconv_t_valid(cd))
804 return -1;
805
806 size_t outlen = MB_LEN_MAX * slen;
807 char *out = mutt_mem_malloc(outlen + 1);
808 char *saved_out = out;
809
810 const size_t convlen = iconv(cd, (ICONV_CONST char **) &s, &slen, &out, &outlen);
811 if (convlen == ICONV_ILLEGAL_SEQ)
812 rc = errno;
813
814 FREE(&saved_out);
815 return rc;
816}
void * mutt_mem_malloc(size_t size)
Allocate memory on the heap.
Definition: memory.c:91
#define FREE(x)
Definition: memory.h:45
iconv_t mutt_ch_iconv_open(const char *tocode, const char *fromcode, uint8_t flags)
Set up iconv for conversions.
Definition: charset.c:594
#define MUTT_ICONV_NO_FLAGS
No flags are set.
Definition: charset.h:73
#define ICONV_ILLEGAL_SEQ
Error value for iconv() - Illegal sequence.
Definition: charset.h:105
static bool iconv_t_valid(const iconv_t cd)
Is the conversion descriptor valid?
Definition: charset.h:114
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_check_charset()

bool mutt_ch_check_charset ( const char *  cs,
bool  strict 
)

Does iconv understand a character set?

Parameters
csCharacter set to check
strictCheck strictly by using iconv
Return values
trueCharacter set is valid

If strict is false, then finding a matching character set in PreferredMimeNames will be enough. If strict is true, or the charset is not in PreferredMimeNames, then iconv() will be run.

Definition at line 894 of file charset.c.

895{
896 if (!cs)
897 return false;
898
899 if (mutt_ch_is_utf8(cs))
900 return true;
901
902 if (!strict)
903 {
904 for (int i = 0; PreferredMimeNames[i].key; i++)
905 {
906 if (mutt_istr_equal(PreferredMimeNames[i].key, cs) ||
908 {
909 return true;
910 }
911 }
912 }
913
914 iconv_t cd = mutt_ch_iconv_open(cs, cs, MUTT_ICONV_NO_FLAGS);
915 if (iconv_t_valid(cd))
916 {
917 return true;
918 }
919
920 return false;
921}
#define mutt_ch_is_utf8(str)
Definition: charset.h:98
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_choose()

char * mutt_ch_choose ( const char *  fromcode,
const struct Slist *  charsets,
const char *  u,
size_t  ulen,
char **  d,
size_t *  dlen 
)

Figure the best charset to encode a string.

Parameters
[in]fromcodeOriginal charset of the string
[in]charsetsList of potential charsets to use
[in]uString to encode
[in]ulenLength of the string to encode
[out]dIf not NULL, point it to the converted string
[out]dlenIf not NULL, point it to the length of the d string
Return values
ptrBest performing charset
NULLNone could be found

Definition at line 1111 of file charset.c.

1113{
1114 if (!fromcode || !charsets)
1115 return NULL;
1116
1117 char *e = NULL, *tocode = NULL;
1118 size_t elen = 0, bestn = 0;
1119
1120 const struct ListNode *np = NULL;
1121 STAILQ_FOREACH(np, &charsets->head, entries)
1122 {
1123 char *t = mutt_str_dup(np->data);
1124 if (!t)
1125 continue;
1126
1127 size_t n = mutt_str_len(t);
1128 char *s = mutt_strn_dup(u, ulen);
1129 const int rc = d ? mutt_ch_convert_string(&s, fromcode, t, MUTT_ICONV_NO_FLAGS) :
1130 mutt_ch_check(s, ulen, fromcode, t);
1131 if (rc)
1132 {
1133 FREE(&t);
1134 FREE(&s);
1135 continue;
1136 }
1137 size_t slen = mutt_str_len(s);
1138
1139 if (!tocode || (n < bestn))
1140 {
1141 bestn = n;
1142 FREE(&tocode);
1143 tocode = t;
1144 if (d)
1145 {
1146 FREE(&e);
1147 e = s;
1148 }
1149 else
1150 {
1151 FREE(&s);
1152 }
1153 elen = slen;
1154 }
1155 else
1156 {
1157 FREE(&t);
1158 FREE(&s);
1159 }
1160 }
1161 if (tocode)
1162 {
1163 if (d)
1164 *d = e;
1165 if (dlen)
1166 *dlen = elen;
1167
1168 char canonical_buf[1024] = { 0 };
1169 mutt_ch_canonical_charset(canonical_buf, sizeof(canonical_buf), tocode);
1170 mutt_str_replace(&tocode, canonical_buf);
1171 }
1172 return tocode;
1173}
void mutt_ch_canonical_charset(char *buf, size_t buflen, const char *name)
Canonicalise the charset of a string.
Definition: charset.c:374
int mutt_ch_convert_string(char **ps, const char *from, const char *to, uint8_t flags)
Convert a string between encodings.
Definition: charset.c:831
int mutt_ch_check(const char *s, size_t slen, const char *from, const char *to)
Check whether a string can be converted between encodings.
Definition: charset.c:796
char * mutt_strn_dup(const char *begin, size_t len)
Duplicate a sub-string.
Definition: string.c:380
char * mutt_str_dup(const char *str)
Copy a string, safely.
Definition: string.c:253
size_t mutt_str_len(const char *a)
Calculate the length of a string, safely.
Definition: string.c:496
char * mutt_str_replace(char **p, const char *s)
Replace one string with another.
Definition: string.c:280
#define STAILQ_FOREACH(var, head, field)
Definition: queue.h:352
A List node for strings.
Definition: list.h:36
char * data
String.
Definition: list.h:37
struct ListHead head
List containing values.
Definition: slist.h:38
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_chscmp()

bool mutt_ch_chscmp ( const char *  cs1,
const char *  cs2 
)

Are the names of two character sets equivalent?

Parameters
cs1First character set
cs2Second character set
Return values
trueNames are equivalent
falseNames differ

Charsets may have extensions that mutt_ch_canonical_charset() leaves intact. We expect 'cs2' to originate from neomutt code, not user input (i.e. 'cs2' does not have any extension), so we simply check whether the shorter string is a prefix of the longer.

Definition at line 442 of file charset.c.

443{
444 if (!cs1 || !cs2)
445 return false;
446
447 char buf[256] = { 0 };
448
449 mutt_ch_canonical_charset(buf, sizeof(buf), cs1);
450
451 int len1 = mutt_str_len(buf);
452 int len2 = mutt_str_len(cs2);
453
454 return mutt_istrn_equal(((len1 > len2) ? buf : cs2),
455 ((len1 > len2) ? cs2 : buf), MIN(len1, len2));
456}
#define MIN(a, b)
Definition: memory.h:32
bool mutt_istrn_equal(const char *a, const char *b, size_t num)
Check for equality of two strings ignoring case (to a maximum), safely.
Definition: string.c:453
+ Here is the call graph for this function:

◆ mutt_ch_convert_nonmime_string()

int mutt_ch_convert_nonmime_string ( const struct Slist *const  assumed_charset,
const char *  charset,
char **  ps 
)

Try to convert a string using a list of character sets.

Parameters
[in]assumed_charsetFrom $assumed_charset
[in]charsetFrom $charset
[in,out]psString to be converted
Return values
0Success
-1Error

Work through $assumed_charset looking for a character set conversion that works. Failing that, try mutt_ch_get_default_charset().

Definition at line 331 of file charset.c.

333{
334 if (!ps)
335 return -1;
336
337 char *u = *ps;
338 const size_t ulen = mutt_str_len(u);
339 if (ulen == 0)
340 return 0;
341
342 const struct ListNode *np = NULL;
343 STAILQ_FOREACH(np, &assumed_charset->head, entries)
344 {
345 char const *c = np->data;
346 size_t n = mutt_str_len(c);
347 char *fromcode = mutt_mem_malloc(n + 1);
348 mutt_str_copy(fromcode, c, n + 1);
349 char *s = mutt_strn_dup(u, ulen);
350 int m = mutt_ch_convert_string(&s, fromcode, charset, MUTT_ICONV_NO_FLAGS);
351 FREE(&fromcode);
352 if (m == 0)
353 {
354 FREE(ps);
355 *ps = s;
356 return 0;
357 }
358 FREE(&s);
359 }
361 charset, MUTT_ICONV_HOOK_FROM);
362 return -1;
363}
const char * mutt_ch_get_default_charset(const struct Slist *const assumed_charset)
Get the default character set.
Definition: charset.c:465
#define MUTT_ICONV_HOOK_FROM
apply charset-hooks to fromcode
Definition: charset.h:74
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_convert_string()

int mutt_ch_convert_string ( char **  ps,
const char *  from,
const char *  to,
uint8_t  flags 
)

Convert a string between encodings.

Parameters
[in,out]psString to convert
[in]fromCurrent character set
[in]toTarget character set
[in]flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
0Success
-1Invalid arguments or failure to open an iconv channel
errnoFailure in iconv conversion

Parameter flags is given as-is to mutt_ch_iconv_open(). See there for its meaning and usage policy.

Definition at line 831 of file charset.c.

832{
833 if (!ps)
834 return -1;
835
836 char *s = *ps;
837
838 if (!s || (*s == '\0'))
839 return 0;
840
841 if (!to || !from)
842 return -1;
843
844 const char *repls[] = { "\357\277\275", "?", 0 };
845 int rc = 0;
846
847 iconv_t cd = mutt_ch_iconv_open(to, from, flags);
848 if (!iconv_t_valid(cd))
849 return -1;
850
851 const char **inrepls = NULL;
852 const char *outrepl = NULL;
853
854 if (mutt_ch_is_utf8(to))
855 outrepl = "\357\277\275";
856 else if (mutt_ch_is_utf8(from))
857 inrepls = repls;
858 else
859 outrepl = "?";
860
861 const char *ib = s;
862 size_t ibl = strlen(s);
863 if (ibl >= (SIZE_MAX / MB_LEN_MAX))
864 {
865 return -1;
866 }
867 size_t obl = MB_LEN_MAX * ibl;
868 char *buf = mutt_mem_malloc(obl + 1);
869 char *ob = buf;
870
871 mutt_ch_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl, &rc);
872 iconv(cd, 0, 0, &ob, &obl);
873
874 *ob = '\0';
875
876 FREE(ps);
877 *ps = buf;
878
879 mutt_str_adjust(ps);
880 return rc;
881}
size_t mutt_ch_iconv(iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, const char **inrepls, const char *outrepl, int *iconverrno)
Change the encoding of a string.
Definition: charset.c:697
void mutt_str_adjust(char **ptr)
Shrink-to-fit a string.
Definition: string.c:299
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv()

int mutt_ch_fgetconv ( struct FgetConv *  fc)

Convert a file's character set.

Parameters
fcFgetConv handle
Return values
numNext character in the converted file
EOFError

A file is read into a buffer and its character set is converted. Each call to this function will return one converted character. The buffer is refilled automatically when empty.

Definition at line 983 of file charset.c.

984{
985 if (!fc)
986 return EOF;
987 if (!iconv_t_valid(fc->cd))
988 return fgetc(fc->fp);
989 if (!fc->p)
990 return EOF;
991 if (fc->p < fc->ob)
992 return (unsigned char) *(fc->p)++;
993
994 /* Try to convert some more */
995 fc->p = fc->bufo;
996 fc->ob = fc->bufo;
997 if (fc->ibl)
998 {
999 size_t obl = sizeof(fc->bufo);
1000 iconv(fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
1001 if (fc->p < fc->ob)
1002 return (unsigned char) *(fc->p)++;
1003 }
1004
1005 /* If we trusted iconv a bit more, we would at this point
1006 * ask why it had stopped converting ... */
1007
1008 /* Try to read some more */
1009 if ((fc->ibl == sizeof(fc->bufi)) ||
1010 (fc->ibl && (fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))))
1011 {
1012 fc->p = 0;
1013 return EOF;
1014 }
1015 if (fc->ibl)
1016 memcpy(fc->bufi, fc->ib, fc->ibl);
1017 fc->ib = fc->bufi;
1018 fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl, fc->fp);
1019
1020 /* Try harder this time to convert some */
1021 if (fc->ibl)
1022 {
1023 size_t obl = sizeof(fc->bufo);
1024 mutt_ch_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl,
1025 fc->inrepls, 0, NULL);
1026 if (fc->p < fc->ob)
1027 return (unsigned char) *(fc->p)++;
1028 }
1029
1030 /* Either the file has finished or one of the buffers is too small */
1031 fc->p = 0;
1032 return EOF;
1033}
char bufi[512]
Definition: charset.h:46
iconv_t cd
iconv conversion descriptor
Definition: charset.h:45
char bufo[512]
Definition: charset.h:47
size_t ibl
Definition: charset.h:51
FILE * fp
Definition: charset.h:44
char * p
Definition: charset.h:48
const char ** inrepls
Definition: charset.h:52
char * ib
Definition: charset.h:50
char * ob
Definition: charset.h:49
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_close()

void mutt_ch_fgetconv_close ( struct FgetConv **  ptr)

Close an fgetconv handle.

Parameters
[out]ptrfgetconv handle

Definition at line 965 of file charset.c.

966{
967 if (!ptr || !*ptr)
968 return;
969
970 FREE(ptr);
971}
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconv_open()

struct FgetConv * mutt_ch_fgetconv_open ( FILE *  fp,
const char *  from,
const char *  to,
uint8_t  flags 
)

Prepare a file for charset conversion.

Parameters
fpFILE ptr to prepare
fromCurrent character set
toDestination character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptrfgetconv handle

Parameter flags is given as-is to mutt_ch_iconv_open().

Definition at line 933 of file charset.c.

934{
935 struct FgetConv *fc = NULL;
936 iconv_t cd = ICONV_T_INVALID;
937
938 if (from && to)
939 cd = mutt_ch_iconv_open(to, from, flags);
940
941 if (iconv_t_valid(cd))
942 {
943 static const char *repls[] = { "\357\277\275", "?", 0 };
944
945 fc = mutt_mem_malloc(sizeof(struct FgetConv));
946 fc->p = fc->bufo;
947 fc->ob = fc->bufo;
948 fc->ib = fc->bufi;
949 fc->ibl = 0;
950 fc->inrepls = mutt_ch_is_utf8(to) ? repls : repls + 1;
951 }
952 else
953 {
954 fc = mutt_mem_malloc(sizeof(struct FgetConvNot));
955 }
956 fc->fp = fp;
957 fc->cd = cd;
958 return fc;
959}
#define ICONV_T_INVALID
Error value for iconv functions.
Definition: charset.h:102
A dummy converter.
Definition: charset.h:59
Cursor for converting a file's encoding.
Definition: charset.h:43
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_fgetconvs()

char * mutt_ch_fgetconvs ( char *  buf,
size_t  buflen,
struct FgetConv *  fc 
)

Convert a file's charset into a string buffer.

Parameters
bufBuffer for result
buflenLength of buffer
fcFgetConv handle
Return values
ptrSuccess, result buffer
NULLError

Read a file into a buffer, converting the character set as it goes.

Definition at line 1045 of file charset.c.

1046{
1047 if (!buf)
1048 return NULL;
1049
1050 size_t r;
1051 for (r = 0; (r + 1) < buflen;)
1052 {
1053 const int c = mutt_ch_fgetconv(fc);
1054 if (c == EOF)
1055 break;
1056 buf[r++] = (char) c;
1057 if (c == '\n')
1058 break;
1059 }
1060 buf[r] = '\0';
1061
1062 if (r > 0)
1063 return buf;
1064
1065 return NULL;
1066}
int mutt_ch_fgetconv(struct FgetConv *fc)
Convert a file's character set.
Definition: charset.c:983
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_default_charset()

const char * mutt_ch_get_default_charset ( const struct Slist *const  assumed_charset)

Get the default character set.

Parameters
assumed_charsetFrom $assumed_charset
Return values
ptrName of the default character set
Warning
This returns a pointer to a static buffer. Do not free it.

Definition at line 465 of file charset.c.

466{
467 static char fcharset[128];
468 const char *c = NULL;
469
470 if (assumed_charset && (assumed_charset->count > 0))
471 c = STAILQ_FIRST(&assumed_charset->head)->data;
472 else
473 c = "us-ascii";
474
475 mutt_str_copy(fcharset, c, sizeof(fcharset));
476 return fcharset;
477}
#define STAILQ_FIRST(head)
Definition: queue.h:350
size_t count
Number of values in list.
Definition: slist.h:39
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_get_langinfo_charset()

char * mutt_ch_get_langinfo_charset ( void  )

Get the user's choice of character set.

Return values
ptrCharset string

Get the canonical character set used by the user's locale. The caller must free the returned string.

Definition at line 486 of file charset.c.

487{
488 char buf[1024] = { 0 };
489
490 mutt_ch_canonical_charset(buf, sizeof(buf), nl_langinfo(CODESET));
491
492 if (buf[0] != '\0')
493 return mutt_str_dup(buf);
494
495 return mutt_str_dup("iso-8859-1");
496}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv()

size_t mutt_ch_iconv ( iconv_t  cd,
const char **  inbuf,
size_t *  inbytesleft,
char **  outbuf,
size_t *  outbytesleft,
const char **  inrepls,
const char *  outrepl,
int *  iconverrno 
)

Change the encoding of a string.

Parameters
[in]cdIconv conversion descriptor
[in,out]inbufBuffer to convert
[in,out]inbytesleftLength of buffer to convert
[in,out]outbufBuffer for the result
[in,out]outbytesleftLength of result buffer
[in]inreplsInput replacement characters
[in]outreplOutput replacement characters
[out]iconverrnoErrno if iconv() fails, 0 if it succeeds
Return values
numCharacters converted

Like iconv, but keeps going even when the input is invalid. If you're supplying inrepls, the source charset should be stateless; if you're supplying an outrepl, the target charset should be.

Definition at line 697 of file charset.c.

700{
701 size_t rc = 0;
702 const char *ib = *inbuf;
703 size_t ibl = *inbytesleft;
704 char *ob = *outbuf;
705 size_t obl = *outbytesleft;
706
707 while (true)
708 {
709 errno = 0;
710 const size_t ret1 = iconv(cd, (ICONV_CONST char **) &ib, &ibl, &ob, &obl);
711 if (ret1 != ICONV_ILLEGAL_SEQ)
712 rc += ret1;
713 if (iconverrno)
714 *iconverrno = errno;
715
716 if (ibl && obl && (errno == EILSEQ))
717 {
718 if (inrepls)
719 {
720 /* Try replacing the input */
721 const char **t = NULL;
722 for (t = inrepls; *t; t++)
723 {
724 const char *ib1 = *t;
725 size_t ibl1 = strlen(*t);
726 char *ob1 = ob;
727 size_t obl1 = obl;
728 iconv(cd, (ICONV_CONST char **) &ib1, &ibl1, &ob1, &obl1);
729 if (ibl1 == 0)
730 {
731 ib++;
732 ibl--;
733 ob = ob1;
734 obl = obl1;
735 rc++;
736 break;
737 }
738 }
739 if (*t)
740 continue;
741 }
742 /* Replace the output */
743 if (!outrepl)
744 outrepl = "?";
745 iconv(cd, NULL, NULL, &ob, &obl);
746 if (obl)
747 {
748 int n = strlen(outrepl);
749 if (n > obl)
750 {
751 outrepl = "?";
752 n = 1;
753 }
754 memcpy(ob, outrepl, n);
755 ib++;
756 ibl--;
757 ob += n;
758 obl -= n;
759 rc++;
760 iconv(cd, NULL, NULL, NULL, NULL); /* for good measure */
761 continue;
762 }
763 }
764 *inbuf = ib;
765 *inbytesleft = ibl;
766 *outbuf = ob;
767 *outbytesleft = obl;
768 return rc;
769 }
770}
#define EILSEQ
Definition: charset.c:55
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_lookup()

const char * mutt_ch_iconv_lookup ( const char *  chs)

Look for a replacement character set.

Parameters
chsCharacter set to lookup
Return values
ptrReplacement character set (if an 'iconv-hook' matches)
NULLNo matching hook

Look through all the 'iconv-hook's. If one matches return the replacement character set.

Definition at line 781 of file charset.c.

782{
784}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_iconv_open()

iconv_t mutt_ch_iconv_open ( const char *  tocode,
const char *  fromcode,
uint8_t  flags 
)

Set up iconv for conversions.

Parameters
tocodeTarget character set
fromcodeCurrent character set
flagsFlags, e.g. MUTT_ICONV_HOOK_FROM
Return values
ptriconv handle for the conversion

Like iconv_open, but canonicalises the charsets, applies charset-hooks, recanonicalises, and finally applies iconv-hooks. Parameter flags=0 skips charset-hooks, while MUTT_ICONV_HOOK_FROM applies them to fromcode. Callers should use flags=0 when fromcode can safely be considered true, either some constant, or some value provided by the user; MUTT_ICONV_HOOK_FROM should be used only when fromcode is unsure, taken from a possibly wrong incoming MIME label, or such. Misusing MUTT_ICONV_HOOK_FROM leads to unwanted interactions in some setups.

Since calling iconv_open() repeatedly can be expensive, we keep a cache of the most recently used iconv_t objects, kept in LRU order. This means that you should not call iconv_close() on the object yourself. All remaining objects in the cache will be cleaned up when main() calls mutt_ch_cache_cleanup().

Note
By design charset-hooks should never be, and are never, applied to tocode.
Despite its name, MUTT_ICONV_HOOK_FROM acts on charset-hooks, not at all on iconv-hooks.

Definition at line 594 of file charset.c.

595{
596 char tocode1[128] = { 0 };
597 char fromcode1[128] = { 0 };
598 const char *tocode2 = NULL, *fromcode2 = NULL;
599 const char *tmp = NULL;
600
601 /* transform to MIME preferred charset names */
602 mutt_ch_canonical_charset(tocode1, sizeof(tocode1), tocode);
603 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), fromcode);
604
605 /* maybe apply charset-hooks and recanonicalise fromcode,
606 * but only when caller asked us to sanitize a potentially wrong
607 * charset name incoming from the wild exterior. */
608 if (flags & MUTT_ICONV_HOOK_FROM)
609 {
610 tmp = mutt_ch_charset_lookup(fromcode1);
611 if (tmp)
612 mutt_ch_canonical_charset(fromcode1, sizeof(fromcode1), tmp);
613 }
614
615 /* check if we have this pair cached already */
616 for (int i = 0; i < IconvCacheUsed; ++i)
617 {
618 if (strcmp(tocode1, IconvCache[i].tocode1) == 0 &&
619 strcmp(fromcode1, IconvCache[i].fromcode1) == 0)
620 {
621 iconv_t cd = IconvCache[i].cd;
622
623 /* make room for this one at the top */
624 struct IconvCacheEntry top = IconvCache[i];
625 for (int j = i; j-- > 0;)
626 {
627 IconvCache[j + 1] = IconvCache[j];
628 }
629 IconvCache[0] = top;
630
631 if (iconv_t_valid(cd))
632 {
633 /* reset state */
634 iconv(cd, NULL, NULL, NULL, NULL);
635 }
636 return cd;
637 }
638 }
639
640 /* not found in cache */
641 /* always apply iconv-hooks to suit system's iconv tastes */
642 tocode2 = mutt_ch_iconv_lookup(tocode1);
643 tocode2 = tocode2 ? tocode2 : tocode1;
644 fromcode2 = mutt_ch_iconv_lookup(fromcode1);
645 fromcode2 = fromcode2 ? fromcode2 : fromcode1;
646
647 /* call system iconv with names it appreciates */
648 iconv_t cd = iconv_open(tocode2, fromcode2);
649
651 {
652 mutt_debug(LL_DEBUG2, "iconv: dropping %s -> %s from the cache\n",
655 /* get rid of the oldest entry */
659 {
660 iconv_close(IconvCache[IconvCacheUsed - 1].cd);
661 }
663 }
664
665 /* make room for this one at the top */
666 for (int j = IconvCacheUsed; j-- > 0;)
667 {
668 IconvCache[j + 1] = IconvCache[j];
669 }
670
672
673 mutt_debug(LL_DEBUG2, "iconv: adding %s -> %s to the cache\n", fromcode1, tocode1);
674 IconvCache[0].fromcode1 = strdup(fromcode1);
675 IconvCache[0].tocode1 = strdup(tocode1);
676 IconvCache[0].cd = cd;
677
678 return cd;
679}
#define mutt_debug(LEVEL,...)
Definition: logging2.h:89
@ LL_DEBUG2
Log at debug level 2.
Definition: logging2.h:44
static int IconvCacheUsed
Number of iconv descriptors in the cache.
Definition: charset.c:100
const char * mutt_ch_iconv_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:781
const char * mutt_ch_charset_lookup(const char *chs)
Look for a replacement character set.
Definition: charset.c:562
#define ICONV_CACHE_SIZE
Max size of the iconv cache.
Definition: charset.c:96
static struct IconvCacheEntry IconvCache[ICONV_CACHE_SIZE]
Cache of iconv conversion descriptors.
Definition: charset.c:98
Cached iconv conversion descriptor.
Definition: charset.c:89
char * tocode1
Destination character set.
Definition: charset.c:91
char * fromcode1
Source character set.
Definition: charset.c:90
iconv_t cd
iconv conversion descriptor
Definition: charset.c:92
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_add()

bool mutt_ch_lookup_add ( enum LookupType  type,
const char *  pat,
const char *  replace,
struct Buffer *  err 
)

Add a new character set lookup.

Parameters
type     Type of character set, e.g. MUTT_LOOKUP_CHARSET
pat      Pattern to match
replace  Replacement string
err      Buffer for error message
Return values
true   Lookup added to list
false  Regex string was invalid

Add a regex for a character set and a replacement name.

Definition at line 509 of file charset.c.

/* Body of mutt_ch_lookup_add(): compile `pat` with REG_ICASE and, on success,
 * append a new Lookup holding the type, pattern and replacement to the tail
 * of the Lookups list. Returns false without adding anything if an argument
 * is missing or the pattern does not compile. */
511{
/* Both the pattern and its replacement are required */
512 if (!pat || !replace)
513 return false;
514
/* The compiled regex lives on the heap so that the Lookup can keep it */
515 regex_t *rx = mutt_mem_calloc(1, sizeof(regex_t));
516 int rc = REG_COMP(rx, pat, REG_ICASE);
517 if (rc != 0)
518 {
/* Compilation failed: write the regex error text into err's buffer, then
 * release the unused regex_t.
 * NOTE(review): err is dereferenced without a NULL check -- confirm that
 * every caller passes a valid Buffer. */
519 regerror(rc, rx, err->data, err->dsize);
520 FREE(&rx);
521 return false;
522 }
523
524 struct Lookup *l = lookup_new();
525 l->type = type;
/* The Lookup stores the results of mutt_str_dup(), not the caller's pointers */
526 l->replacement = mutt_str_dup(replace);
527 l->regex.pattern = mutt_str_dup(pat);
/* rx is now referenced by the Lookup; it is not freed on this path */
528 l->regex.regex = rx;
529 l->regex.pat_not = false;
530
531 TAILQ_INSERT_TAIL(&Lookups, l, entries);
532
533 return true;
534}
void * mutt_mem_calloc(size_t nmemb, size_t size)
Allocate zeroed memory on the heap.
Definition: memory.c:51
static struct LookupList Lookups
Lookup table of preferred character set names.
Definition: charset.c:83
static struct Lookup * lookup_new(void)
Create a new Lookup.
Definition: charset.c:269
#define TAILQ_INSERT_TAIL(head, elm, field)
Definition: queue.h:809
#define REG_COMP(preg, regex, cflags)
Compile a regular expression.
Definition: regex3.h:49
size_t dsize
Length of data.
Definition: buffer.h:39
char * data
Pointer to data.
Definition: buffer.h:37
Regex to String lookup table.
Definition: charset.c:74
char * replacement
Alternative charset to use.
Definition: charset.c:77
enum LookupType type
Lookup type.
Definition: charset.c:75
struct Regex regex
Regular expression.
Definition: charset.c:76
char * pattern
printable version
Definition: regex3.h:86
bool pat_not
do not match
Definition: regex3.h:88
regex_t * regex
compiled expression
Definition: regex3.h:87
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_lookup_remove()

void mutt_ch_lookup_remove ( void  )

Remove all the character set lookups.

Empty the list of replacement character set names.

Definition at line 541 of file charset.c.

/* Body of mutt_ch_lookup_remove(): unlink every entry from the Lookups list
 * and pass it to lookup_free(). */
542{
543 struct Lookup *l = NULL;
/* tmp is the extra cursor required by TAILQ_FOREACH_SAFE, which allows l to
 * be removed from the list while iterating */
544 struct Lookup *tmp = NULL;
545
546 TAILQ_FOREACH_SAFE(l, &Lookups, entries, tmp)
547 {
/* Unlink first, then free the entry */
548 TAILQ_REMOVE(&Lookups, l, entries);
549 lookup_free(&l);
550 }
551}
static void lookup_free(struct Lookup **ptr)
Free a Lookup.
Definition: charset.c:278
#define TAILQ_FOREACH_SAFE(var, head, field, tvar)
Definition: queue.h:735
#define TAILQ_REMOVE(head, elm, field)
Definition: queue.h:841
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_set_charset()

void mutt_ch_set_charset ( const char *  charset)

Update the records for a new character set.

Parameters
charset  New character set

Check if this character set is utf-8 and pick a suitable replacement character for unprintable characters.

Note
This calls bind_textdomain_codeset() which will affect future message translations.

Definition at line 1078 of file charset.c.

/* Body of mutt_ch_set_charset(): canonicalise `charset` into a local buffer,
 * then set the globals CharsetIsUtf8 and ReplacementChar to match it. */
1079{
/* Scratch buffer for the canonical charset name, zero-initialised */
1080 char buf[256] = { 0 };
1081
1082 mutt_ch_canonical_charset(buf, sizeof(buf), charset);
1083
/* mutt_ch_is_utf8() is a macro for mutt_ch_chscmp(buf, "utf-8") */
1084 if (mutt_ch_is_utf8(buf))
1085 {
1086 CharsetIsUtf8 = true;
1087 ReplacementChar = 0xfffd; /* replacement character */
1088 }
1089 else
1090 {
/* Any other charset: use a plain question mark as the replacement */
1091 CharsetIsUtf8 = false;
1092 ReplacementChar = '?';
1093 }
1094
/* Only compiled in when both HAVE_BIND_TEXTDOMAIN_CODESET and ENABLE_NLS
 * are defined; it is passed the canonical name in buf, not the caller's
 * original string */
1095#if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS)
1096 bind_textdomain_codeset(PACKAGE, buf);
1097#endif
1098}
bool CharsetIsUtf8
Is the user's current character set utf-8?
Definition: charset.c:66
wchar_t ReplacementChar
When a Unicode character can't be displayed, use this instead.
Definition: charset.c:61
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ mutt_ch_cache_cleanup()

void mutt_ch_cache_cleanup ( void  )

Clean up the cached iconv handles and charset strings.

Definition at line 1178 of file charset.c.

/* Body of mutt_ch_cache_cleanup(): release every in-use IconvCache entry and
 * mark the cache as empty. */
1179{
/* Only the first IconvCacheUsed slots are visited */
1180 for (int i = 0; i < IconvCacheUsed; ++i)
1181 {
/* Release the cached charset name strings */
1182 FREE(&IconvCache[i].fromcode1);
1183 FREE(&IconvCache[i].tocode1);
/* An entry may hold an invalid descriptor, so close only valid ones */
1184 if (iconv_t_valid(IconvCache[i].cd))
1185 {
1186 iconv_close(IconvCache[i].cd);
1187 }
1188 }
/* Reset the in-use count to zero */
1189 IconvCacheUsed = 0;
1190}
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ iconv_t_valid()

static bool iconv_t_valid ( const iconv_t  cd)
inline static

Is the conversion descriptor valid?

Parameters
cd    Conversion descriptor to test
Return values
true  It's valid

Definition at line 114 of file charset.h.

/* Body of iconv_t_valid(): true when cd differs from ICONV_T_INVALID, which
 * is defined as ((iconv_t) -1). */
115{
116 return cd != ICONV_T_INVALID;
117}
+ Here is the caller graph for this function:

Variable Documentation

◆ CharsetIsUtf8

bool CharsetIsUtf8
extern

Is the user's current character set utf-8?

Definition at line 66 of file charset.c.

◆ ReplacementChar

wchar_t ReplacementChar
extern

When a Unicode character can't be displayed, use this instead.

Definition at line 61 of file charset.c.