Module:Text: Difference between revisions

From AusMetal Guide
Jump to navigation Jump to search
m (1 revision imported)
(factor data out into Module:Text/data, loaded via mw.loadData)
Line 1: Line 1:
local Text = { serial = "2017-11-01",
local yesNo = require("Module:Yesno")
local Text = { serial = "2022-07-21",
               suite  = "Text" }
               suite  = "Text" }
--[=[
--[=[
Text utilities
Text utilities
]=]
]=]
-- local globals
local PatternCJK        = false
local PatternCombined  = false
local PatternLatin      = false
local PatternTerminated = false
local QuoteLang        = false
local QuoteType        = false
local RangesLatin      = false
local SeekQuote        = false
local function factoryQuote()
    -- Create quote definitions
    QuoteLang = { af        = "bd",
                  ar        = "la",
                  be        = "labd",
                  bg        = "bd",
                  ca        = "la",
                  cs        = "bd",
                  da        = "bd",
                  de        = "bd",
                  dsb      = "bd",
                  et        = "bd",
                  el        = "lald",
                  en        = "ld",
                  es        = "la",
                  eu        = "la",
            --    fa        = "la",
                  fi        = "rd",
                  fr        = "laSPC",
                  ga        = "ld",
                  he        = "ldla",
                  hr        = "bd",
                  hsb      = "bd",
                  hu        = "bd",
                  hy        = "labd",
                  id        = "rd",
                  is        = "bd",
                  it        = "ld",
                  ja        = "x300C",
                  ka        = "bd",
                  ko        = "ld",
                  lt        = "bd",
                  lv        = "bd",
                  nl        = "ld",
                  nn        = "la",
                  no        = "la",
                  pl        = "bdla",
                  pt        = "lald",
                  ro        = "bdla",
                  ru        = "labd",
                  sk        = "bd",
                  sl        = "bd",
                  sq        = "la",
                  sr        = "bx",
                  sv        = "rd",
                  th        = "ld",
                  tr        = "ld",
                  uk        = "la",
                  zh        = "ld",
                  ["de-ch"] = "la",
                  ["en-gb"] = "lsld",
                  ["en-us"] = "ld",
                  ["fr-ch"] = "la",
                  ["it-ch"] = "la",
                  ["pt-br"] = "ldla",
                  ["zh-tw"] = "x300C",
                  ["zh-cn"] = "ld" }
    QuoteType = { bd    = { { 8222, 8220 },  { 8218, 8217 } },
                  bdla  = { { 8222, 8220 },  {  171,  187 } },
                  bx    = { { 8222, 8221 },  { 8218, 8217 } },
                  la    = { {  171,  187 },  { 8249, 8250 } },
                  laSPC = { {  171,  187 },  { 8249, 8250 },  true },
                  labd  = { {  171,  187 },  { 8222, 8220 } },
                  lald  = { {  171,  187 },  { 8220, 8221 } },
                  ld    = { { 8220, 8221 },  { 8216, 8217 } },
                  ldla  = { { 8220, 8221 },  {  171,  187 } },
                  lsld  = { { 8216, 8217 },  { 8220, 8221 } },
                  rd    = { { 8221, 8221 },  { 8217, 8217 } },
                  x300C = { { 0x300C, 0x300D },
                            { 0x300E, 0x300F } } }
    return r
end -- factoryQuote()


local function fiatQuote( apply, alien, advance )
local function fiatQuote( apply, alien, advance )
Line 100: Line 12:
     --    alien    -- string, with language code
     --    alien    -- string, with language code
     --    advance  -- number, with level 1 or 2
     --    advance  -- number, with level 1 or 2
     local r = apply
     local r = apply and tostring(apply) or ""
    alien = alien or "en"
    advance = tonumber(advance) or 0
     local suite
     local suite
     if not QuoteLang then
     local data = mw.loadData('Module:Text/data')
        factoryQuote()
     local QuoteLang = data.QuoteLang
     end
     local QuoteType = data.QuoteType
     suite = QuoteLang[ alien ]
     local slang = alien:match( "^(%l+)-" )
     if not suite then
    suite = QuoteLang[alien] or slang and QuoteLang[slang] or QuoteLang["en"]
        local slang = alien:match( "^(%l+)-" )
        if slang then
            suite = QuoteLang[ slang ]
        end
        if not suite then
            suite = QuoteLang[ "en" ]
        end
    end
     if suite then
     if suite then
         local quotes = QuoteType[ suite ]
         local quotes = QuoteType[ suite ]
Line 149: Line 55:
     --    accept  -- true, if no error messages to be appended
     --    accept  -- true, if no error messages to be appended
     -- Returns: string
     -- Returns: string
     local r
     local r = ""
     if type( apply ) == "table" then
     apply = type(apply) == "table" and apply or {}
        local bad  = { }
    again = math.floor(tonumber(again) or 1)
        local codes = { }
    if again < 1 then
        local s
    return ""
        for k, v in pairs( apply ) do
    end
            s = type( v )
    local bad  = { }
            if s == "number" then
    local codes = { }
                if v < 32 and v ~= 9 and v ~= 10 then
    for _, v in ipairs( apply ) do
                    v = tostring( v )
    local n = tonumber(v)
                else
    if not n or (n < 32 and n ~= 9 and n ~= 10) then
                    v = math.floor( v )
    table.insert(bad, tostring(v))
                    s = false
    else
                end
    table.insert(codes, math.floor(n))
            elseif s ~= "string" then
end
                v = tostring( v )
    end  
            end
    if #bad > 0 then
            if s then
    if not accept then
                table.insert( bad, v )
    r = tostring(  mw.html.create( "span" )
            else
                    :addClass( "error" )
                table.insert( codes, v )
                    :wikitext( "bad codepoints: " .. table.concat( bad, " " )) )
            end
    end
        end -- for k, v
    return r
        if #bad == 0 then
            if #codes > 0 then
                r = mw.ustring.char( unpack( codes ) )
                if again then
                    if type( again ) == "number" then
                        local n = math.floor( again )
                        if n > 1 then
                            r = r:rep( n )
                        elseif n < 1 then
                            r = ""
                        end
                    else
                        s = "bad repetitions: " .. tostring( again )
                    end
                end
            end
        else
            s = "bad codepoints: " .. table.concat( bad, " " )
        end
        if s  and  not accept then
            r = tostring(  mw.html.create( "span" )
                                  :addClass( "error" )
                                  :wikitext( s ) )
        end
     end
     end
     return r or ""
    if #codes > 0 then
    r = mw.ustring.char( unpack( codes ) )
    if again > 1 then
    r = r:rep(again)
    end
end
     return r
end -- Text.char()
end -- Text.char()


 
local function trimAndFormat(args, fmt)
local result = {}
if type(args) ~= 'table' then
args = {args}
end
for _, v in ipairs(args) do
v = mw.text.trim(tostring(v))
if v ~= "" then
table.insert(result,fmt and mw.ustring.format(fmt, v) or v)
end
end
return result
end


Text.concatParams = function ( args, apply, adapt )
Text.concatParams = function ( args, apply, adapt )
Line 210: Line 110:
     -- Returns: string
     -- Returns: string
     local collect = { }
     local collect = { }
     args = type(args) == 'table' and args or {} -- ensure args is table
     return table.concat(trimAndFormat(args,adapt), apply or "|")
    for k, v in pairs( args ) do
        if type( k ) == "number" then
            v = mw.text.trim( v )
            if v ~= "" then
                if adapt then
                    v = mw.ustring.format( adapt, v )
                end
                table.insert( collect, v )
            end
        end
    end -- for k, v
    return table.concat( collect, apply or "|" )
end -- Text.concatParams()
end -- Text.concatParams()






Text.containsCJK = function ( analyse )
Text.containsCJK = function ( s )
     -- Is any CJK code within?
     -- Is any CJK code within?
     -- Parameter:
     -- Parameter:
     --    analyse -- string
     --    s -- string
     -- Returns: true, if CJK detected
     -- Returns: true, if CJK detected
     analyse = analyse or ""
     s = s and tostring(s) or ""
     if not patternCJK then
     local patternCJK = mw.loadData('Module:Text/data').PatternCJK
        patternCJK = mw.ustring.char( 91,
     return mw.ustring.find( s, patternCJK ) ~= nil
                                    4352, 45,  4607,
                                  11904, 45,  42191,
                                  43072, 45,  43135,
                                  44032, 45,  55215,
                                  63744, 45,  64255,
                                  65072, 45,  65103,
                                  65381, 45,  65500,
                                      131072, 45, 196607,
                                      93 )
     end
    if mw.ustring.find( analyse, patternCJK ) then
    return true
    end
    return false
end -- Text.containsCJK()
end -- Text.containsCJK()


Line 258: Line 132:
--    suffix = ending delimiter
--    suffix = ending delimiter
-- Returns: stripped string
-- Returns: stripped string
s = s and tostring(s) or ""
prefix = prefix and tostring(prefix) or ""
suffix = suffix and tostring(suffix) or ""
local prefixLen = mw.ustring.len(prefix)
local prefixLen = mw.ustring.len(prefix)
local suffixLen = mw.ustring.len(suffix)
local suffixLen = mw.ustring.len(suffix)
if prefixLen == 0 or suffixLen == 0 then
return s
end
local i = s:find(prefix, 1, true)
local i = s:find(prefix, 1, true)
local r = s
local r = s
Line 288: Line 168:
end -- Text.getPlain()
end -- Text.getPlain()


 
Text.isLatinRange = function (s)
 
Text.isLatinRange = function ( adjust )
     -- Are characters expected to be latin or symbols within latin texts?
     -- Are characters expected to be latin or symbols within latin texts?
     -- Precondition:
     -- Arguments:
     --     adjust -- string, or nil for initialization
     --  s = string to analyze
     -- Returns: true, if valid for latin only
     -- Returns: true, if valid for latin only
     local r
     s = s and tostring(s) or "" --- ensure input is always string
    if not RangesLatin then
     local PatternLatin = mw.loadData('Module:Text/data').PatternLatin
        RangesLatin = { {    7, 687 },
     return mw.ustring.match(s, PatternLatin) ~= nil
                        { 7531, 7578 },
                        { 7680, 7935 },
                        { 8194, 8250 } }
     end
    if not PatternLatin then
        local range
        PatternLatin = "^["
        for i = 1, #RangesLatin do
            range = RangesLatin[ i ]
            PatternLatin = PatternLatin ..
                          mw.ustring.char( range[ 1 ], 45, range[ 2 ] )
        end    -- for i
        PatternLatin = PatternLatin .. "]*$"
     end
    if adjust then
        if mw.ustring.match( adjust, PatternLatin ) then
            r = true
        else
            r = false
        end
    end
    return r
end -- Text.isLatinRange()
end -- Text.isLatinRange()






Text.isQuote = function ( ask )
Text.isQuote = function ( s )
     -- Is this character any quotation mark?
     -- Is this character any quotation mark?
     -- Parameter:
     -- Parameter:
     --    ask  -- string, with single character
     --    s = single character to analyze
     -- Returns: true, if ask is quotation mark
     -- Returns: true, if s is quotation mark
     local r
     s = s and tostring(s) or ""
     if not SeekQuote then
     if s == "" then
        SeekQuote = mw.ustring.char(  34,      -- "
    return false
                                      39,      -- '
                                      171,      -- laquo
                                      187,      -- raquo
                                    8216,      -- lsquo
                                    8217,      -- rsquo
                                    8218,      -- sbquo
                                    8220,      -- ldquo
                                    8221,      -- rdquo
                                    8222,      -- bdquo
                                    8249,      -- lsaquo
                                    8250,      -- rsaquo
                                    0x300C,    -- CJK
                                    0x300D,    -- CJK
                                    0x300E,    -- CJK
                                    0x300F )    -- CJK
     end
     end
     if ask == "" then
     local SeekQuote = mw.loadData('Module:Text/data').SeekQuote
        r = false
     return mw.ustring.find( SeekQuote, s, 1, true ) ~= nil
     elseif mw.ustring.find( SeekQuote, ask, 1, true ) then
        r = true
    else
        r = false
    end
    return r
end -- Text.isQuote()
end -- Text.isQuote()


Line 366: Line 201:
     --    adapt  -- string (optional); format including "%s"
     --    adapt  -- string (optional); format including "%s"
     -- Returns: string
     -- Returns: string
     local collect = { }
     return mw.text.listToText(trimAndFormat(args, adapt))
    for k, v in pairs( args ) do
        if type( k ) == "number" then
            v = mw.text.trim( v )
            if v ~= "" then
                if adapt then
                    v = mw.ustring.format( adapt, v )
                end
                table.insert( collect, v )
            end
        end
    end -- for k, v
    return mw.text.listToText( collect )
end -- Text.listToText()
end -- Text.listToText()


Line 390: Line 213:
     --    advance  -- number, with level 1 or 2, or nil
     --    advance  -- number, with level 1 or 2, or nil
     -- Returns: quoted string
     -- Returns: quoted string
    apply = apply and tostring(apply) or ""
     local mode, slang
     local mode, slang
     if type( alien ) == "string" then
     if type( alien ) == "string" then
Line 417: Line 241:
     --    advance  -- number, with level 1 or 2, or nil
     --    advance  -- number, with level 1 or 2, or nil
     -- Returns: string; possibly quoted
     -- Returns: string; possibly quoted
     local r = mw.text.trim( apply )
     local r = mw.text.trim( apply and tostring(apply) or "" )
     local s = mw.ustring.sub( r, 1, 1 )
     local s = mw.ustring.sub( r, 1, 1 )
     if s ~= ""  and  not Text.isQuote( s, advance ) then
     if s ~= ""  and  not Text.isQuote( s, advance ) then
Line 437: Line 261:
     --                  or basic greek or cyrillic or symbols etc.
     --                  or basic greek or cyrillic or symbols etc.
     local cleanup, decomposed
     local cleanup, decomposed
     if not PatternCombined then
     local PatternCombined = mw.loadData('Module:Text/data').PatternCombined
        PatternCombined = mw.ustring.char( 91,
     decomposed = mw.ustring.toNFD( adjust and tostring(adjust) or "" )
                                            0x0300, 45, 0x036F,
                                            0x1AB0, 45, 0x1AFF,
                                            0x1DC0, 45, 0x1DFF,
                                            0xFE20, 45, 0xFE2F,
                                          93 )
    end
     decomposed = mw.ustring.toNFD( adjust )
     cleanup    = mw.ustring.gsub( decomposed, PatternCombined, "" )
     cleanup    = mw.ustring.gsub( decomposed, PatternCombined, "" )
     return mw.ustring.toNFC( cleanup )
     return mw.ustring.toNFC( cleanup )
Line 459: Line 276:
     -- Returns: true, if sentence terminated
     -- Returns: true, if sentence terminated
     local r
     local r
     if not PatternTerminated then
     local PatternTerminated = mw.loadData('Module:Text/data').PatternTerminated
        PatternTerminated = mw.ustring.char( 91,
                                            12290,
                                            65281,
                                            65294,
                                            65311 )
                            .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
    end
     if mw.ustring.find( analyse, PatternTerminated ) then
     if mw.ustring.find( analyse, PatternTerminated ) then
         r = true
         r = true
Line 477: Line 287:




Text.ucfirstAll = function ( adjust )
Text.ucfirstAll = function ( adjust)
     -- Capitalize all words
     -- Capitalize all words
     -- Precondition:
     -- Arguments:
     --    adjust -- string
     --    adjust = string to adjust
     -- Returns: string with all first letters in upper case
     -- Returns: string with all first letters in upper case
     local r = " " .. adjust
    adjust = adjust and tostring(adjust) or ""
     local r = mw.text.decode(adjust,true)
     local i = 1
     local i = 1
     local c, j, m
     local c, j, m
     if adjust:find( "&" ) then
     m = (r ~= adjust)
        r = r:gsub( "&amp;",      "&#38;" )
    r = " "..r
            :gsub( "&lt;",      "&#60;" )
            :gsub( "&gt;",      "&#62;" )
            :gsub( "&nbsp;",    "&#160;" )
            :gsub( "&thinsp;", "&#8201;" )
            :gsub( "&zwnj;",  "&#8204;" )
            :gsub( "&zwj;",    "&#8205;" )
            :gsub( "&lrm;",    "&#8206;" )
            :gsub( "&rlm;",    "&#8207;" )
        m = true
    end
     while i do
     while i do
         i = mw.ustring.find( r, "%W%l", i )
         i = mw.ustring.find( r, "%W%l", i )
Line 511: Line 312:
     r = r:sub( 2 )
     r = r:sub( 2 )
     if m then
     if m then
        r = r:gsub(    "&#38;", "&amp;" )
    r = mw.text.encode(r)
            :gsub(    "&#60;", "&lt;" )
            :gsub(    "&#62;", "&gt;" )
            :gsub(    "&#160;", "&nbsp;" )
            :gsub(  "&#8201;", "&thinsp;" )
            :gsub(  "&#8204;", "&zwnj;" )
            :gsub(  "&#8205;", "&zwj;" )
            :gsub(  "&#8206;", "&lrm;" )
            :gsub(  "&#8207;", "&rlm;" )
            :gsub( "&#X(%x+);", "&#x%1;" )
     end
     end
     return r
     return r
end -- Text.ucfirstAll()
end -- Text.ucfirstAll()




Line 534: Line 325:
     -- Returns: string with non-latin parts enclosed in <span>
     -- Returns: string with non-latin parts enclosed in <span>
     local r
     local r
     Text.isLatinRange()
     local data = mw.loadData('Module:Text/data')
    local PatternLatin = data.PatternLatin
    local RangesLatin = data.RangesLatin
    local NumLatinRanges = data.NumLatinRanges
     if mw.ustring.match( adjust, PatternLatin ) then
     if mw.ustring.match( adjust, PatternLatin ) then
         -- latin only, horizontal dashes, quotes
         -- latin only, horizontal dashes, quotes
Line 548: Line 342:
                   -- isLatin
                   -- isLatin
                   local range
                   local range
                   for i = 1, #RangesLatin do
                  -- NumLatinRanges has to be precomputed because # does not work from loadData
                   for i = 1, NumLatinRanges do
                       range = RangesLatin[ i ]
                       range = RangesLatin[ i ]
                       if a >= range[ 1 ]  and  a <= range[ 2 ] then
                       if a >= range[ 1 ]  and  a <= range[ 2 ] then
Line 622: Line 417:
     return r
     return r
end -- Text.uprightNonlatin()
end -- Text.uprightNonlatin()




Line 628: Line 422:
     local r
     local r
     if about == "quote" then
     if about == "quote" then
         factoryQuote()
         data = mw.loadData('Module:Text/data')
         r = { }
         r = { }
         r.QuoteLang = QuoteLang
         r.QuoteLang = data.QuoteLang
         r.QuoteType = QuoteType
         r.QuoteType = data.QuoteType
     end
     end
     return r
     return r
end -- Text.test()
end -- Text.test()


-- Non Unicode-aware version of mw.text.split and mw.text.gsplit
-- based on [[phab:diffusion/ELUA/browse/master/includes/Engines/LuaCommon/lualib/mw.text.lua]]
-- These run up to 60 times faster than the Unicode-aware versions
Text.split = function ( text, pattern, plain )
local ret = {}
for m in Text.gsplit( text, pattern, plain ) do
ret[#ret+1] = m
end
return ret
end


Text.gsplit = function ( text, pattern, plain )
local s, l = 1, string.len( text )
return function ()
if s then
local e, n = string.find( text, pattern, s, plain )
local ret
if not e then
ret = string.sub( text, s )
s = nil
elseif n < e then
-- Empty separator!
ret = string.sub( text, s, e )
if e < l then
s = e + 1
else
s = nil
end
else
ret = e > s and string.sub( text, s, e - 1 ) or ''
s = n + 1
end
return ret
end
end, nil, nil
end


-- Export
-- Export
local p = { }
local p = { }
for _, func in ipairs({'containsCJK','isLatinRange','isQuote','sentenceTerminated'}) do
p[func] = function (frame)
return Text[func]( frame.args[ 1 ] or "" ) and "1" or ""
end
end
for _, func in ipairs({'getPlain','removeDiacritics','ucfirstAll','uprightNonlatin'}) do
p[func] = function (frame)
return Text[func]( frame.args[ 1 ] or "" )
end
end


function p.char( frame )
function p.char( frame )
Line 650: Line 491:
     end
     end
     if story then
     if story then
         local items = mw.text.split( story, "%s+" )
         local items = mw.text.split( mw.text.trim(story), "%s+" )
         if #items > 0 then
         if #items > 0 then
             local j
             local j
             lenient  = ( params.errors == "0" )
             lenient  = (yesNo(params.errors) == false)
             codes    = { }
             codes    = { }
             multiple = tonumber( params[ "*" ] )
             multiple = tonumber( params[ "*" ] )
             for k, v in pairs( items ) do
             for _, v in ipairs( items ) do
                if v:sub( 1, 1 ) == "x" then
            j = tonumber((v:sub( 1, 1 ) == "x" and "0" or "") .. v)
                    j = tonumber( "0" .. v )
                 table.insert( codes,  j or v )
                 elseif v == "" then
             end  
                    v = false
                else
                    j = tonumber( v )
                end
                if v then
                    table.insert( codes,  j or v )
                end
             end -- for k, v
         end
         end
     end
     end
Line 689: Line 522:
                               frame.args.format )
                               frame.args.format )
end
end
function p.containsCJK( frame )
    return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or ""
end
function p.getPlain( frame )
    return Text.getPlain( frame.args[ 1 ] or "" )
end
function p.isLatinRange( frame )
    return Text.isLatinRange( frame.args[ 1 ] or "" ) and "1" or ""
end
function p.isQuote( frame )
    return Text.isQuote( frame.args[ 1 ] or "" ) and "1" or ""
end




Line 786: Line 602:
                               tonumber( frame.args[3] ) )
                               tonumber( frame.args[3] ) )
end
end
function p.removeDiacritics( frame )
    return Text.removeDiacritics( frame.args[ 1 ] or "" )
end
function p.sentenceTerminated( frame )
    return Text.sentenceTerminated( frame.args[ 1 ] or "" ) and "1" or ""
end
function p.ucfirstAll( frame )
    return Text.ucfirstAll( frame.args[ 1 ] or "" )
end
function p.uprightNonlatin( frame )
    return Text.uprightNonlatin( frame.args[ 1 ] or "" )
end




Line 847: Line 644:
end
end


function p.split(frame)
local text = frame.args.text or frame.args[1] or ''
local pattern = frame.args.pattern or frame.args[2] or ''
local plain = yesNo(frame.args.plain or frame.args[3])
local index = tonumber(frame.args.index) or tonumber(frame.args[4]) or 1
local a = Text.split(text, pattern, plain)
if index < 0 then index = #a + index + 1 end
return a[index]
end




Line 852: Line 659:
     return Text.serial
     return Text.serial
end
end





Revision as of 14:01, 21 September 2024

Text – Module containing methods for the manipulation of text, wikimarkup and some HTML.

Functions for templates

All methods have an unnamed parameter containing the text.

The return value is an empty string if the parameter does not meet the conditions. When the condition is matched or some result is successfully found, strings of at least one character are returned.

char
Creates a string from a list of character codes.
1
Space-separated list of character codes
*
Number of repetitions of the list in parameter 1; (Default 1).
errors
0 – Silence errors
concatParams
Combine any number of elements into a list, like table.concat() in Lua.
From a template:
1
First element; missing and empty elements are ignored.
2 3 4 5 6 …
Further list elements
From Lua
args
table (sequence) of the elements
apply
Separator between elements; defaults to |
adapt
optional formatting, which will be applied to each element; must contain %s.
containsCJK
Returns whether the input string contains any CJK characters
  • Returns nothing if there are no CJK characters
removeDelimited
Remove all text between delimiters, including the delimiters themselves.
getPlain
Remove wikimarkup (except templates): comments, tags, bold, italic, nbsp
isLatinRange
Returns some content, unless the string contains a character that would not normally be found in Latin text.
  • Returns nothing if there is a non-Latin string.
isQuote
Returns some content if the parameter passed is a single character, and that character is a quote, such as '.
  • Returns nothing for multiple characters, or if the character passed is not a quote.
listToText
Formats list elements analogously to mw.text.listToText().
The elements are separated by a comma and a space; the word "and" appears between the last two elements.
Unnamed parameters become the list items.
Optional parameters for #invoke:
  • format – Every list element will first be formatted with this format string; see here for how to construct this string. The string must contain at least one %s sequence.
  • template=1 – List elements should be taken from the calling template.
Returns the resulting string.
quote
Wrap the string in quotes; quotes can be chosen for a specific language.
1
Input text (will be automatically trimmed); may be empty.
2
(optional) the ISO 639 language code for the quote marks; should be one of the supported languages Template:In lang
3
(optional) 2 for second level quotes. This means the single quote marks in a statement such as: Jack said, “Jill said ‘fish’ last Tuesday.”
quoteUnquoted
Wrap the string in quotes; quotes can be chosen for a specific language. Will not quote an empty string, and will not quote if there is a quote at the start or end of the (trimmed) string.
1
Input text (will be automatically trimmed); may be empty.
2
(optional) the ISO 639 language code for the quote marks; should be one of the supported languages Template:In lang
3
(optional) 2 for second level quotes. This means the single quote marks in a statement such as: Jack said, “Jill said ‘fish’ last Tuesday.”
removeDiacritics
Removes all diacritical marks from the input.
1
Input text
sentenceTerminated
Is this sentence terminated? Should work with CJK, and allows quotation marks to follow.
  • Returns nothing if the sentence is unterminated.
ucfirstAll
The first letter of every recognized word is converted to upper case. This contrasts with the parser function {{ucfirst:}} which changes only the first character of the whole string passed.
A few common HTML entities are protected; the implementation of this may mean that numerical entities passed (e.g. &#38;) are converted to &amp; form
uprightNonlatin
Takes a string. Italicized non-Latin characters are un-italicized, unless they are a single Greek letter.
zip
Combines a tuple of lists by convolution. This is easiest to explain by example: given two lists, list1 = "a b c" and list2 = "1 2 3", then
zip(list1, list2, sep = " ", isep = "-", osep = "/")
outputs
a-1/b-2/c-3
  • 1, 2, 3, … – Lists to be combined
  • sep – A separator (in Lua regex form) used to split the lists. If empty, the lists are split into individual characters.
  • sep1, sep2, sep3, … – Allows a different separator to be used for each list.
  • isep – Output separator; placed between elements which were at the same index in their lists.
  • osep – Output separator; placed between elements which had different original indices; i.e. between the groups joined with isep

Examples and test page

There are tests available Template:In lang to illustrate this in practice.

Use in another Lua module

All of the above functions can be called from other Lua modules. Use require(); the code below checks for errors while loading it:

<syntaxhighlight lang="lua">
local lucky, Text = pcall( require, "Module:Text" )
if type( Text ) == "table" then
    Text = Text.Text()
else
    -- In the event of errors, Text is an error message.
    return "" .. Text .. ""
end
</syntaxhighlight>

You may then call:

  • Text.char( apply, again, accept )
  • Text.concatParams( args, separator, format )
  • Text.containsCJK( s )
  • Text.removeDelimited( s )
  • Text.getPlain( s )
  • Text.isLatinRange( s )
  • Text.isQuote( c )
  • Text.listToText( table, format )
  • Text.quote( s, lang, mode )
  • Text.quoteUnquoted( s, lang, mode )
  • Text.removeDiacritics( s )
  • Text.sentenceTerminated( s )
  • Text.ucfirstAll( s )
  • Text.uprightNonlatin( s )
  • Text.zip(…)
  • Text.test( s )

Usage

This is a general library; use it anywhere.

Dependencies

Module:Yesno (loaded with require) and Module:Text/data (loaded with mw.loadData).

See also


local yesNo = require("Module:Yesno")
-- Library table; the Text.* functions defined below are attached to it.
-- It starts out with two string fields, serial and suite.
local Text = { serial = "2022-07-21",
               suite  = "Text" }
--[=[
Text utilities

Lookup tables and patterns are not built in this file; the functions
below read them from Module:Text/data via mw.loadData.
]=]

local function fiatQuote( apply, alien, advance )
    -- Quote text with the quotation marks registered for a language
    -- Parameter:
    --     apply    -- text to be quoted; converted by tostring(),
    --                 nil and false are treated as empty string
    --     alien    -- string, with language code; anything else
    --                 (including nil) falls back to "en"
    --     advance  -- number, with level 1 or 2
    -- Returns: string; the quoted text, or the unquoted text if no
    --          pair of marks is available for that language and level
    local r = apply and tostring(apply) or ""
    if type( alien ) ~= "string" then
        -- Only strings offer :match(); other values cannot be a code
        alien = "en"
    end
    advance = tonumber(advance) or 0
    local data = mw.loadData('Module:Text/data')
    local QuoteLang = data.QuoteLang
    local QuoteType = data.QuoteType
    -- Leading lowercase part before a hyphen, e.g. "de" for "de-at"
    local slang = alien:match( "^(%l+)-" )
    -- Lookup order: full code, then leading part, then English
    local suite = QuoteLang[alien] or slang and QuoteLang[slang] or QuoteLang["en"]
    if suite then
        local quotes = QuoteType[ suite ]
        if quotes then
            -- A third entry requests a non-breaking space inside the marks
            local space = quotes[ 3 ] and "&#160;" or ""
            quotes = quotes[ advance ]
            if quotes then
                -- Pass the normalized text r rather than the raw argument,
                -- so that nil, booleans or tables never reach the "%s"
                -- conversion.
                r = mw.ustring.format( "%s%s%s%s%s",
                                       mw.ustring.char( quotes[ 1 ] ),
                                       space,
                                       r,
                                       space,
                                       mw.ustring.char( quotes[ 2 ] ) )
            end
        else
            -- Language maps to a quote type that has no definition
            mw.log( "fiatQuote() " .. suite )
        end
    end
    return r
end -- fiatQuote()



Text.char = function ( apply, again, accept )
    -- Create string from codepoints
    -- Parameter:
    --     apply   -- table (sequence) with numerical codepoints, or nil;
    --                entries are converted by tonumber()
    --     again   -- number of repetitions, or nil (default: 1);
    --                less than 1 yields an empty string
    --     accept  -- true, if no error messages to be appended
    -- Returns: string; the characters, or, if any entry is invalid,
    --          an error <span> (empty string when accept is set)
    local r = ""
    apply = type(apply) == "table" and apply or {}
    again = math.floor(tonumber(again) or 1)
    if again < 1 then
        return ""
    end
    local bad   = { }
    local codes = { }
    for _, v in ipairs( apply ) do
        local n = tonumber(v)
        -- Accepted: tab (9), line feed (10) and 32 up to U+10FFFF.
        -- Written as a positive test so that NaN, infinity and values
        -- beyond the Unicode range are reported as bad entries rather
        -- than handed on to mw.ustring.char().
        if n and (n >= 32 or n == 9 or n == 10) and n <= 0x10FFFF then
            table.insert(codes, math.floor(n))
        else
            table.insert(bad, tostring(v))
        end
    end
    if #bad > 0 then
        -- Any invalid entry suppresses the regular output
        if not accept then
            r = tostring(  mw.html.create( "span" )
                                  :addClass( "error" )
                                  :wikitext( "bad codepoints: " .. table.concat( bad, " " )) )
        end
        return r
    end
    if #codes > 0 then
        r = mw.ustring.char( unpack( codes ) )
        if again > 1 then
            r = r:rep(again)
        end
    end
    return r
end -- Text.char()

local function trimAndFormat(args, fmt)
    -- Trim the items of a list, drop the empty ones and optionally
    -- run every remaining item through a format string
    -- Parameter:
    --     args  -- table (sequence) of values; any other value is
    --              treated as a list with that single item
    --     fmt   -- string (optional); format including "%s"
    -- Returns: table (sequence) with the non-empty, formatted strings
    local items = type(args) == 'table' and args or { args }
    local kept = {}
    for _, item in ipairs(items) do
        local text = mw.text.trim(tostring(item))
        if text ~= "" then
            if fmt then
                text = mw.ustring.format(fmt, text)
            end
            kept[#kept + 1] = text
        end
    end
    return kept
end

Text.concatParams = function ( args, apply, adapt )
    -- Concat list items into one string
    --     Items are trimmed; empty items are skipped
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string
    return table.concat( trimAndFormat( args, adapt ), apply or "|" )
end -- Text.concatParams()



Text.containsCJK = function ( s )
    -- Check whether the text holds at least one CJK character
    -- Parameter:
    --     s  -- string; other values are converted, nil counts as ""
    -- Returns: true, if CJK detected; false otherwise
    local subject = ""
    if s then
        subject = tostring(s)
    end
    local data = mw.loadData('Module:Text/data')
    local hit = mw.ustring.find( subject, data.PatternCJK )
    return hit ~= nil
end -- Text.containsCJK()

Text.removeDelimited = function (s, prefix, suffix)
    -- Remove all text in s delimited by prefix and suffix (inclusive)
    -- Arguments:
    --    s = string to process
    --    prefix = initial delimiter (literal text, not a pattern)
    --    suffix = ending delimiter (literal text, not a pattern)
    -- Returns: stripped string; a prefix without a following suffix
    --          removes everything up to the end of the string.
    --          If either delimiter is empty, s is returned unchanged
    s = s and tostring(s) or ""
    prefix = prefix and tostring(prefix) or ""
    suffix = suffix and tostring(suffix) or ""
    -- Byte lengths: all positions below come from the byte-based
    -- string library, so codepoint counts would be wrong for
    -- non-ASCII delimiters
    local prefixLen = #prefix
    local suffixLen = #suffix
    if prefixLen == 0 or suffixLen == 0 then
        return s
    end
    local r = s
    local i = r:find(prefix, 1, true)
    while i do
        -- Plain search is required: a suffix like "-->" contains
        -- pattern magic characters and would otherwise match a bare ">"
        local j = r:find(suffix, i + prefixLen, true)
        if j then
            r = r:sub(1, i - 1) .. r:sub(j + suffixLen)
        else
            r = r:sub(1, i - 1)
        end
        -- restart from the beginning; joining the remaining parts
        -- may have produced a new prefix
        i = r:find(prefix, 1, true)
    end
    return r
end

Text.getPlain = function ( adjust )
    -- Strip wikisyntax from a string, leaving templates untouched
    --     Removed: HTML comments, tags starting with a lower-case
    --     letter, bold and italic apostrophes; &nbsp; becomes a space
    -- Parameter:
    --     adjust  -- string
    -- Returns: string
    local plain = Text.removeDelimited( adjust, "<!--", "-->" )
    -- applied in this order; ''' has to be handled before ''
    local replacements = {
        { "(</?%l[^>]*>)", "" },
        { "'''", "" },
        { "''", "" },
        { "&nbsp;", " " }
    }
    for _, pair in ipairs( replacements ) do
        plain = plain:gsub( pair[ 1 ], pair[ 2 ] )
    end
    return plain
end -- Text.getPlain()

Text.isLatinRange = function (s)
    -- Does the text match the latin pattern of Module:Text/data,
    -- i.e. consist of characters expected within latin texts?
    -- Arguments:
    --  s = string to analyze; other values are converted, nil counts as ""
    -- Returns: true, if valid for latin only; false otherwise
    local subject = ""
    if s then
        subject = tostring(s)
    end
    local data = mw.loadData('Module:Text/data')
    local found = mw.ustring.match( subject, data.PatternLatin )
    return found ~= nil
end -- Text.isLatinRange()



Text.isQuote = function ( s )
    -- Check a character against the list of known quotation marks
    -- Parameter:
    --     s = single character to analyze
    -- Returns: true, if s occurs literally in SeekQuote of
    --          Module:Text/data; false otherwise or if s is empty
    local candidate = s and tostring(s) or ""
    if candidate ~= "" then
        local known = mw.loadData('Module:Text/data').SeekQuote
        -- plain search, no pattern interpretation of the candidate
        local at = mw.ustring.find( known, candidate, 1, true )
        return at ~= nil
    end
    return false
end -- Text.isQuote()



Text.listToText = function ( args, adapt )
    -- Join list items the way mw.text.listToText() does,
    -- after trimming them and skipping the empty ones
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string
    local items = trimAndFormat( args, adapt )
    return mw.text.listToText( items )
end -- Text.listToText()



Text.quote = function ( apply, alien, advance )
    -- Enclose text in the quotation marks of a language
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: quoted string
    local text = apply and tostring(apply) or ""
    local slang
    if type( alien ) == "string" then
        slang = mw.text.trim( alien ):lower()
    else
        -- No usable code given: take the page language if the title
        -- object offers one, else the content language of the wiki
        -- TODO FIXME: Introduction expected 2017-04
        slang = mw.title.getCurrentTitle().pageLanguage
                or mw.language.getContentLanguage():getCode()
    end
    -- everything but an explicit 2 means first level
    local mode = ( advance == 2 ) and 2 or 1
    return fiatQuote( mw.text.trim( text ), slang, mode )
end -- Text.quote()



Text.quoteUnquoted = function ( apply, alien, advance )
    -- Quote text, if not yet quoted and not empty
    --     The text counts as quoted already if its first or its last
    --     character is a quotation mark
    -- Parameter:
    --     apply    -- string, with text
    --     alien    -- string, with language code, or nil
    --     advance  -- number, with level 1 or 2, or nil
    -- Returns: string; trimmed and possibly quoted
    local r = mw.text.trim( apply and tostring(apply) or "" )
    local s = mw.ustring.sub( r, 1, 1 )
    if s ~= ""  and  not Text.isQuote( s ) then
        -- last character; an end index of 1 would yield ""
        -- for any text longer than one character
        s = mw.ustring.sub( r, -1 )
        if not Text.isQuote( s ) then
            r = Text.quote( r, alien, advance )
        end
    end
    return r
end -- Text.quoteUnquoted()



Text.removeDiacritics = function ( adjust )
    -- Strip diacritics: decompose (NFD), delete everything matching
    -- PatternCombined of Module:Text/data, recompose (NFC)
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; all latin letters should be ASCII
    --                  or basic greek or cyrillic or symbols etc.
    local combining = mw.loadData('Module:Text/data').PatternCombined
    local source = adjust and tostring(adjust) or ""
    local stripped = mw.ustring.gsub( mw.ustring.toNFD( source ),
                                      combining,
                                      "" )
    return mw.ustring.toNFC( stripped )
end -- Text.removeDiacritics()



Text.sentenceTerminated = function ( analyse )
    -- Is string terminated by dot, question or exclamation mark?
    --     Quotation, link termination and so on granted
    -- Parameter:
    --     analyse  -- string; other values are converted,
    --                 nil counts as ""
    -- Returns: true, if the text matches PatternTerminated of
    --          Module:Text/data; false otherwise
    -- coerce like the other predicates do, so nil does not raise an error
    analyse = analyse and tostring(analyse) or ""
    local PatternTerminated = mw.loadData('Module:Text/data').PatternTerminated
    return mw.ustring.find( analyse, PatternTerminated ) ~= nil
end -- Text.sentenceTerminated()



Text.ucfirstAll = function ( adjust)
    -- Turn the first letter of every word into upper case
    --     A word starts at a lower-case letter that follows
    --     a non-alphanumeric character or begins the text
    -- Arguments:
    --     adjust = string to adjust
    -- Returns: string with all first letters in upper case
    adjust = adjust and tostring(adjust) or ""
    local decoded = mw.text.decode( adjust, true )
    -- remember whether entities were resolved, to restore them later
    local hadEntities = ( decoded ~= adjust )
    -- leading space lets the very first letter match "%W%l" as well
    local r = " " .. decoded
    local at = mw.ustring.find( r, "%W%l", 1 )
    while at do
        local letter = at + 1
        r = mw.ustring.sub( r, 1, at )
            .. mw.ustring.upper( mw.ustring.sub( r, letter, letter ) )
            .. mw.ustring.sub( r, letter + 1 )
        at = mw.ustring.find( r, "%W%l", letter )
    end
    -- drop the helper space again
    r = r:sub( 2 )
    if hadEntities then
        r = mw.text.encode( r )
    end
    return r
end -- Text.ucfirstAll()


Text.uprightNonlatin = function ( adjust )
    -- Ensure non-italics for non-latin text parts
    --     One single greek letter might be granted
    --     Text matching PatternLatin of Module:Text/data is returned
    --     unchanged; otherwise the text is scanned codepoint by
    --     codepoint and each run that starts with a non-latin
    --     character is wrapped in a <span> with font-style:normal
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with non-latin parts enclosed in <span>
    local r
    local data = mw.loadData('Module:Text/data')
    local PatternLatin = data.PatternLatin
    local RangesLatin = data.RangesLatin
    local NumLatinRanges = data.NumLatinRanges
    if mw.ustring.match( adjust, PatternLatin ) then
        -- latin only, horizontal dashes, quotes
        r = adjust
    else
        -- Scanner state:
        --     c  -- codepoint at the current position
        --     j  -- start index of the open non-latin run, or false
        --     k  -- index of the first character not yet copied to r
        --     m  -- index at which a latin character would show that
        --           the open run is just one greek letter, or false
        --     n  -- length of adjust in characters
        local c
        local j    = false
        local k    = 1
        local m    = false
        local n    = mw.ustring.len( adjust )
        local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
        local flat = function ( a )
                  -- isLatin
                  -- Returns true if codepoint a lies within any of the
                  -- RangesLatin intervals, nothing (nil) otherwise
                  local range
                  -- NumLatinRanges has to be precomputed because # does not work from loadData
                  for i = 1, NumLatinRanges do
                      range = RangesLatin[ i ]
                      if a >= range[ 1 ]  and  a <= range[ 2 ] then
                          return true
                      end
                  end    -- for i
              end -- flat()
        local focus = function ( a )
                  -- char is not ambivalent
                  -- true for codepoints above 64 except 8192..8212
                  -- (U+2000..U+2014), and for '&' and '<';
                  -- all other codepoints neither start nor end a run
                  local r = ( a > 64 )
                  if r then
                      r = ( a < 8192  or  a > 8212 )
                  else
                      r = ( a == 38  or  a == 60 )    -- '&' '<'
                  end
                  return r
              end -- focus()
        local form = function ( a )
                -- Result so far (outer r), followed by the pending
                -- plain text k..j-1 and the run j..a inside the <span>
                return string.format( span,
                                      r,
                                      mw.ustring.sub( adjust, k, j - 1 ),
                                      mw.ustring.sub( adjust, j, a ) )
              end -- form()
        r = ""
        for i = 1, n do
            c = mw.ustring.codepoint( adjust, i, i )
            if focus( c ) then
                if flat( c ) then
                    -- latin character: closes an open run, if any
                    if j then
                        if m then
                            if i == m then
                                -- single greek letter.
                                -- the run is dropped and stays unwrapped
                                j = false
                            end
                            m = false
                        end
                        if j then
                            -- Close the run before position i, but keep
                            -- trailing spaces and opening parentheses
                            -- outside of the <span>
                            local nx = i - 1
                            local s  = ""
                            for ix = nx, 1, -1 do
                                -- c is reused here for a one-character string
                                c = mw.ustring.sub( adjust, ix, ix )
                                if c == " "  or  c == "(" then
                                    nx = nx - 1
                                    s  = c .. s
                                else
                                    break -- for ix
                                end
                            end -- for ix
                            r = form( nx ) .. s
                            j = false
                            k = i
                        end
                    end
                elseif not j then
                    -- non-latin character opens a new run
                    j = i
                    if c >= 880  and  c <= 1023 then
                        -- single greek letter?
                        -- codepoints 880..1023 are U+0370..U+03FF
                        m = i + 1
                    else
                        m = false
                    end
                end
            elseif m then
                -- ambivalent character behind a greek letter:
                -- shift the position where latin text is expected
                m = m + 1
            end
        end    -- for i
        if j  and  ( not m  or  m < n ) then
            -- a run is still open at the end of the text: wrap it
            r = form( n )
        else
            -- append the remaining text unwrapped
            r = r .. mw.ustring.sub( adjust, k )
        end
    end
    return r
end -- Text.uprightNonlatin()


Text.test = function ( about )
    -- Expose internal data for testing
    -- Parameter:
    --     about  -- string; "quote" selects the quotation tables
    -- Returns: table with QuoteLang and QuoteType of Module:Text/data
    --          for "quote"; nil for anything else
    local r
    if about == "quote" then
        -- keep data local; a bare assignment would create a global
        local data = mw.loadData('Module:Text/data')
        r = { QuoteLang = data.QuoteLang,
              QuoteType = data.QuoteType }
    end
    return r
end -- Text.test()

-- Non Unicode-aware version of mw.text.split and mw.text.gsplit
-- based on [[phab:diffusion/ELUA/browse/master/includes/Engines/LuaCommon/lualib/mw.text.lua]]
-- These run up to 60 times faster than the Unicode-aware versions
Text.split = function ( text, pattern, plain )
    -- Split text at every match of pattern (byte-based)
    -- Parameter:
    --     text     -- string to split
    --     pattern  -- separator; Lua pattern unless plain is set
    --     plain    -- true, if pattern is literal text
    -- Returns: table (sequence) with all pieces from Text.gsplit()
    local pieces = {}
    local count = 0
    for piece in Text.gsplit( text, pattern, plain ) do
        count = count + 1
        pieces[count] = piece
    end
    return pieces
end

Text.gsplit = function ( text, pattern, plain )
    -- Iterator version of Text.split() (byte-based)
    -- Parameter:
    --     text     -- string to split
    --     pattern  -- separator; Lua pattern unless plain is set
    --     plain    -- true, if pattern is literal text
    -- Returns: iterator function yielding one piece per call
    local pos = 1                       -- start of the next piece; nil when done
    local total = string.len( text )
    local function nextPiece()
        if not pos then
            return
        end
        local first, last = string.find( text, pattern, pos, plain )
        if not first then
            -- no further separator: the rest is the last piece
            local rest = string.sub( text, pos )
            pos = nil
            return rest
        end
        if last < first then
            -- Empty separator!
            -- hand out one byte per call until the text is used up
            local piece = string.sub( text, pos, first )
            pos = ( first < total ) and ( first + 1 ) or nil
            return piece
        end
        local piece = ''
        if first > pos then
            piece = string.sub( text, pos, first - 1 )
        end
        pos = last + 1
        return piece
    end
    return nextPiece, nil, nil
end

-- Export
local p = { }

-- Predicates: the first argument of #invoke is checked,
-- the result is "1" for true and "" for false
local predicates = { 'containsCJK', 'isLatinRange', 'isQuote', 'sentenceTerminated' }
for i = 1, #predicates do
    local name = predicates[ i ]
    p[ name ] = function ( frame )
        if Text[ name ]( frame.args[ 1 ] or "" ) then
            return "1"
        end
        return ""
    end
end

-- Converters: the first argument of #invoke is transformed
-- and the result of the library function is returned as is
local converters = { 'getPlain', 'removeDiacritics', 'ucfirstAll', 'uprightNonlatin' }
for i = 1, #converters do
    local name = converters[ i ]
    p[ name ] = function ( frame )
        return Text[ name ]( frame.args[ 1 ] or "" )
    end
end

function p.char( frame )
    -- Template/#invoke access to Text.char()
    --     1       -- codepoints separated by whitespace; an item
    --                starting with "x" is read as hexadecimal
    --     *       -- number of repetitions
    --     errors  -- a "no" value (per Module:Yesno) suppresses
    --                the error message
    -- Parameters are taken from the calling template; if that has
    -- no parameter 1, from the #invoke itself
    local params = frame:getParent().args
    local story = params[ 1 ]
    if not story then
        params = frame.args
        story = params[ 1 ]
    end
    local codes, lenient, multiple
    local items = story and mw.text.split( mw.text.trim( story ), "%s+" )
    if items and #items > 0 then
        lenient = ( yesNo( params.errors ) == false )
        multiple = tonumber( params[ "*" ] )
        codes = { }
        for i = 1, #items do
            local token = items[ i ]
            local lead = ""
            if token:sub( 1, 1 ) == "x" then
                -- "x41" becomes "0x41", which tonumber() reads as hex
                lead = "0"
            end
            -- items that are no number are passed on unchanged,
            -- for Text.char() to report them
            codes[ i ] = tonumber( lead .. token ) or token
        end
    end
    return Text.char( codes, multiple, lenient )
end

function p.concatParams( frame )
    -- Template/#invoke access to Text.concatParams()
    --     template   -- "1": join the parameters of the calling
    --                   template rather than those of the #invoke
    --     separator  -- string between items
    --     format     -- format including "%s", applied to each item
    local config = frame.args
    local template = config.template
    if type( template ) == "string" then
        template = ( mw.text.trim( template ) == "1" )
    end
    local args = template and frame:getParent().args or config
    return Text.concatParams( args, config.separator, config.format )
end


function p.listToFormat(frame)
    -- Apply a format string to the items of several lists in parallel
    -- Parameter (frame.args):
    --     1, 2, ...  -- lists; each a string of items separated by sep
    --     format     -- string; for every row the first "%s" still
    --                   present is replaced by the item of list 1,
    --                   the next one by the item of list 2, and so on
    --     sep        -- separator, passed to mw.text.split() as
    --                   pattern (default: ";")
    -- Returns: string; all formatted rows concatenated.
    --          Lists shorter than the longest one contribute ""
    local lists = {}
    local pformat = frame.args["format"] or ""
    local sep = frame.args["sep"] or ";"

    -- parse parameters: collect the lists
    for k, v in pairs(frame.args) do
        local knum = tonumber(k)
        if knum then lists[knum] = v end
    end

    -- split the lists
    local maxListLen = 0
    for i = 1, #lists do
        lists[i] = mw.text.split(lists[i], sep)
        if #lists[i] > maxListLen then maxListLen = #lists[i] end
    end

    -- generate the result string
    local rows = {}
    for i = 1, maxListLen do
        local row = pformat
        for j = 1, #lists do
            -- a missing item must not reach gsub() as nil
            local item = lists[j][i] or ""
            -- A function as replacement inserts the item literally;
            -- as a string, any "%" in it would be taken as
            -- a capture reference
            row = mw.ustring.gsub(row, "%%s", function () return item end, 1)
        end
        rows[i] = row
    end

    return table.concat(rows)
end



function p.listToText( frame )
    -- Template/#invoke access to Text.listToText()
    --     template  -- "1": list the parameters of the calling
    --                  template rather than those of the #invoke
    --     format    -- format including "%s", applied to each item
    local config = frame.args
    local template = config.template
    if type( template ) == "string" then
        template = ( mw.text.trim( template ) == "1" )
    end
    local args = template and frame:getParent().args or config
    return Text.listToText( args, config.format )
end



function p.quote( frame )
    -- #invoke access to Text.quote()
    --     1  -- text
    --     2  -- language code; empty means none given
    --     3  -- level, 1 or 2
    local args = frame.args
    local slang = args[2]
    if type( slang ) == "string" then
        slang = mw.text.trim( slang )
        -- an empty code is passed on as false
        slang = ( slang ~= "" ) and slang or false
    end
    return Text.quote( args[ 1 ] or "", slang, tonumber( args[3] ) )
end



function p.quoteUnquoted( frame )
    -- #invoke access to Text.quoteUnquoted()
    --     1  -- text
    --     2  -- language code; empty means none given
    --     3  -- level, 1 or 2
    local args = frame.args
    local slang = args[2]
    if type( slang ) == "string" then
        slang = mw.text.trim( slang )
        -- an empty code is passed on as false
        slang = ( slang ~= "" ) and slang or false
    end
    return Text.quoteUnquoted( args[ 1 ] or "", slang, tonumber( args[3] ) )
end


function p.zip(frame)
    -- Interleave the items of several lists
    -- Parameter (frame.args):
    --     1, 2, ...        -- lists, as strings of separated items
    --     sep              -- default separator for all lists
    --     sep1, sep2, ...  -- separator for one particular list
    --     isep             -- string between the items of one row
    --     osep             -- string between rows
    -- Returns: string; row i holds the i-th item of every list,
    --          "" where a list is too short
    local args = frame.args
    local defaultsep = args["sep"] or ""
    local innersep = args["isep"] or ""
    local outersep = args["osep"] or ""
    local lists = {}
    local seps = {}

    -- parse parameters
    for key, value in pairs(args) do
        local index = tonumber(key)
        if index then
            lists[index] = value
        elseif string.sub(key, 1, 3) == "sep" then
            local sepIndex = tonumber(string.sub(key, 4))
            if sepIndex then
                seps[sepIndex] = value
            end
        end
    end
    -- use the default separator where no explicit one was given
    for i = 1, math.max(#seps, #lists) do
        seps[i] = seps[i] or defaultsep
    end

    -- split the lists
    local longest = 0
    for i = 1, #lists do
        local pieces = mw.text.split(lists[i], seps[i])
        lists[i] = pieces
        if #pieces > longest then
            longest = #pieces
        end
    end

    -- assemble row by row
    local out = {}
    for row = 1, longest do
        if row > 1 then
            out[#out + 1] = outersep
        end
        for col = 1, #lists do
            if col > 1 then
                out[#out + 1] = innersep
            end
            out[#out + 1] = lists[col][row] or ""
        end
    end
    return table.concat(out)
end


function p.split(frame)
    -- #invoke access to Text.split(); returns one single piece
    --     text    or 1  -- string to split
    --     pattern or 2  -- separator
    --     plain   or 3  -- yes/no (per Module:Yesno); literal separator
    --     index   or 4  -- number of the piece (default: 1);
    --                      negative values count from the end
    local args = frame.args
    local text = args.text or args[1] or ''
    local pattern = args.pattern or args[2] or ''
    local plain = yesNo(args.plain or args[3])
    local index = tonumber(args.index) or tonumber(args[4]) or 1
    local parts = Text.split(text, pattern, plain)
    if index < 0 then
        index = index + #parts + 1
    end
    return parts[index]
end


p.failsafe = function ()
    -- Version identification
    -- Returns: string; the serial of this module
    return Text.serial
end -- p.failsafe()


function p.Text()
    -- Library access for other modules
    -- Returns: table with all Text functions
    return Text
end -- p.Text

return p