Lua 的字符串 #
Lua 版本 5.3.4
1 字符串的数据结构
1.1 字符串分类
从 5.2.0版本开始,Lua 开始区分长字符串和短字符串,“长短”长度的标准定义在 llimits.h #define LUAI_MAXSHORTLEN 40
“长短” 类型的定义在 lobject.h
/* Variant tags for strings */
/* Each tag is the base LUA_TSTRING tag OR-ed with a variant number shifted left by 4 bits. */
#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */
#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */
1.2 字符串的结构
/*
** Header for string value; string bytes follow the end of this structure
** (aligned according to 'UTString'; see next).
** This is only the header of a string: the actual characters are stored
** right after this structure.
*/
typedef struct TString {
CommonHeader;
lu_byte extra; /* reserved words for short strings; "has hash" for longs */
/* short strings: marks whether the string is a reserved word;
   long strings: whether the hash has already been computed (see note ①) */
lu_byte shrlen; /* length for short strings */
unsigned int hash; /* hash value; for long strings luaS_hashlongstr fills it in on first use */
union {
size_t lnglen; /* length for long strings */
struct TString *hnext; /* short strings: linked list for hash table */
} u;
} TString;
/*
** Ensures that address after this type is always fully aligned.
*/
typedef union UTString {
L_Umaxalign dummy; /* for memory alignment */
TString tsv; /* the string header itself */
} UTString;
补充1: 字符串申请内存大小 #define sizelstring(l) (sizeof(union UTString) + ((l) + 1) * sizeof(char))
说明①: 长字符串是惰性求 hash 值的(第一次需要时才计算)
/*
** Returns the hash of a long string, computing it on first request.
** 'extra' acts as the "hash already computed" flag; until it is set,
** the 'hash' field is passed to luaS_hash as the seed.
*/
unsigned int luaS_hashlongstr (TString *ts) {
  lua_assert(ts->tt == LUA_TLNGSTR);
  if (ts->extra != 0)  /* hash already computed? */
    return ts->hash;
  ts->hash = luaS_hash(getstr(ts), ts->u.lnglen, ts->hash);
  ts->extra = 1;  /* remember that the hash is now valid */
  return ts->hash;
}
2.Lua hash 的计算
在lua 5.2.0 之前,lua的哈希计算十分简单(在 lstring.c luaS_newlstr实现),很容易根据其过程构造大量hash值一样的,但是各自内容不相同的字符串。这容易造成hash dos。(导致字符串哈希表拥有一个很大的链表,查找,修改数据性能下降)
接下来说说 5.3.4 版本的 hash 计算函数。后续版本为了解决这一安全问题,加入了随机种子。
2.2.1 字符串随机种子的实现
全局表中的随机种子: g->seed = makeseed(L);
/* Time-based seed value: the result of time(NULL) passed through the 'cast' macro with type unsigned int. */
#define luai_makeseed() cast(unsigned int, time(NULL))
/*
** Compute an initial seed as random as possible. Rely on Address Space
** Layout Randomization (if present) to increase randomness..
** The seed is built from the current time combined with several memory
** addresses.
*/
/*
** addbuff(b,p,e): converts 'e' to size_t, copies the bytes of that value
** into buffer 'b' at offset 'p', then advances 'p' by sizeof(size_t).
** 'p' must be a modifiable lvalue. Every line of the definition except
** the last has to end with a line-continuation backslash; otherwise the
** remaining statements fall outside the macro and are parsed as stray
** file-scope code.
*/
#define addbuff(b,p,e) \
  { size_t t = cast(size_t, e); \
    memcpy(b + p, &t, sizeof(t)); \
    p += sizeof(t); }
/*
** Builds the hash seed for a new state. Four pointer-sized values (the
** state pointer, the address of a local, luaO_nilobject and the address
** of lua_newstate) are packed into a byte buffer, which is then hashed
** with the value from luai_makeseed() as the starting seed.
*/
static unsigned int makeseed (lua_State *L) {
  char bytes[4 * sizeof(size_t)];
  unsigned int tseed = luai_makeseed();
  int used = 0;  /* number of bytes written into 'bytes' so far */
  addbuff(bytes, used, L);  /* heap variable */
  addbuff(bytes, used, &tseed);  /* local variable */
  addbuff(bytes, used, luaO_nilobject);  /* global variable */
  addbuff(bytes, used, &lua_newstate);  /* public function */
  lua_assert(used == sizeof(bytes));  /* buffer must be exactly full */
  return luaS_hash(bytes, used, tseed);
}
2.2.2 字符串hash值的计算
/*
** Hashes 'l' bytes of 'str', starting from 'seed' xor-ed with the length.
** Bytes are sampled from the end of the string backwards, 'step'
** positions apart, so long strings are not read in full.
*/
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  unsigned int acc = seed ^ cast(unsigned int, l);
  /* distance between sampled bytes; LUAI_HASHLIMIT defaults to 5 */
  size_t step = (l >> LUAI_HASHLIMIT) + 1;
  while (l >= step) {
    acc ^= ((acc<<5) + (acc>>2) + cast_byte(str[l - 1]));
    l -= step;
  }
  return acc;
}
3.字符串的创建
/*
** new string (with explicit length)
** Creation takes one of two paths depending on the length: strings of
** at most LUAI_MAXSHORTLEN bytes go through internshrstr, longer ones
** get a fresh long-string object that the bytes are copied into.
*/
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  TString *lng;
  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
    return internshrstr(L, str, l);
  /* long string: check the size limit before allocating */
  if (l >= (MAX_SIZE - sizeof(TString))/sizeof(char))
    luaM_toobig(L);
  lng = luaS_createlngstrobj(L, l);
  memcpy(getstr(lng), str, l * sizeof(char));
  return lng;
}
下面是创建新的长短字符串对象的函数。
/*
** creates a new string object
** Allocates sizelstring(l) bytes through luaC_newobj, stores 'h' in the
** hash field, clears 'extra' and writes the terminating '\0' at index
** 'l'. The string contents themselves are not written here.
*/
static TString *createstrobj (lua_State *L, size_t l, int tag, unsigned int h) {
  size_t nbytes = sizelstring(l);  /* total size of TString object */
  GCObject *obj = luaC_newobj(L, tag, nbytes);
  TString *str = gco2ts(obj);
  str->hash = h;
  str->extra = 0;
  getstr(str)[l] = '\0';  /* ending 0 */
  return str;
}
补充1: 短字符串的内部化
短字符串放在全局的字符串表中
global_State->strt;
其中strt结构如下:
/* Table of short strings, reached through the 'strt' field of the global state. */
typedef struct stringtable {
TString **hash; /* array of bucket heads; strings in a bucket are chained through u.hnext */
int nuse; /* number of elements */
int size; /* number of buckets in 'hash' */
} stringtable;
--------------------- internshrstr ----------------------------
/*
** Checks whether an equal short string already exists in the string
** table; if it does, that string is reused, otherwise a new one is
** created and linked into the table.
*/
static TString *internshrstr (lua_State *L, const char *str, size_t l) {
TString *ts;
global_State *g = G(L);
unsigned int h = luaS_hash(str, l, g->seed);
/* bucket for this hash */
TString **list = &g->strt.hash[lmod(h, g->strt.size)];
lua_assert(str != NULL); /* otherwise 'memcmp'/'memcpy' are undefined */
/* walk the bucket chain looking for a string with the same length and bytes */
for (ts = *list; ts != NULL; ts = ts->u.hnext) {
if (l == ts->shrlen &&
(memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
/* found! */
if (isdead(g, ts)) /* dead (but not collected yet)? */
changewhite(ts); /* resurrect it */
return ts;
}
}
/* not found: double the table first if it holds at least as many strings
   as buckets (only while size <= MAX_INT/2) */
if (g->strt.nuse >= g->strt.size && g->strt.size <= MAX_INT/2) {
luaS_resize(L, g->strt.size * 2);
list = &g->strt.hash[lmod(h, g->strt.size)]; /* recompute with new size */
}
ts = createstrobj(L, l, LUA_TSHRSTR, h);
memcpy(getstr(ts), str, l * sizeof(char));
ts->shrlen = cast_byte(l);
/* push the new string at the head of its bucket chain */
ts->u.hnext = *list;
*list = ts;
g->strt.nuse++;
return ts;
}
4. 字符串的比较
在已知上面的基础上,思考:如果自己实现Lua字符串的比较,会怎么去实现?
短字符串如何比较相等,长字符串呢?
4.1 短字符串
相同的短字符串在Lua中只会存在一份,那么我们直接根据二者的物理地址比较就可以,非常高效。 #define eqshrstr(a,b) check_exp((a)->tt == LUA_TSHRSTR, (a) == (b))
4.2 长字符串
/*
** Equality test for long strings. Two long strings are equal when they
** are the same object, or when they have the same length and the same
** bytes. Returns 1 if equal, 0 otherwise.
*/
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_TLNGSTR && b->tt == LUA_TLNGSTR);
  if (a == b)  /* same instance */
    return 1;
  if (len != b->u.lnglen)  /* different lengths can never match */
    return 0;
  return memcmp(getstr(a), getstr(b), len) == 0;  /* equal contents? */
}
5. 字符串表的调整 ##
// newsize is always a power of two
/*
** Resizes the string table to 'newsize' buckets and redistributes every
** string into the bucket selected by lmod(hash, newsize).
** When growing, the array is enlarged before rehashing; when shrinking,
** it is reallocated only after rehashing has emptied the upper buckets.
*/
void luaS_resize (lua_State *L, int newsize) {
int i;
stringtable *tb = &G(L)->strt;
if (newsize > tb->size) { /* grow table if needed */
luaM_reallocvector(L, tb->hash, tb->size, newsize, TString *);
for (i = tb->size; i < newsize; i++)
tb->hash[i] = NULL; /* new buckets start empty */
}
for (i = 0; i < tb->size; i++) { /* rehash */
TString *p = tb->hash[i];
tb->hash[i] = NULL; /* detach the old chain before relinking its nodes */
while (p) { /* for each node in the list */
TString *hnext = p->u.hnext; /* save next */
unsigned int h = lmod(p->hash, newsize); /* new position */
p->u.hnext = tb->hash[h]; /* chain it */
tb->hash[h] = p;
p = hnext;
}
}
if (newsize < tb->size) { /* shrink table if needed */
/* vanishing slice should be empty */
lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
luaM_reallocvector(L, tb->hash, tb->size, newsize, TString *);
}
tb->size = newsize;
}
Lua源码中调用这个函数的有三个地方:1. GC的时候,2.增加短字符串 3.初始化Lua字符串环境(luaS_init)
参考资料
<1> Lua 源码 5.3.4
<2> 云风 https://www.codingnow.com/temp/readinglua.pdf