A5/1流密码算法的实现与分析
1. 背景简介
关注GSM手机网络安全的人都知道,21年来,全球35亿人一直在使用的传统64位GSM手机网络加密技术最近已经被公开破解,窃听手机现在已经不是国安局的专利了,任何人都可以利用破解出来的密码本破译截获的手机信息。安全专家们此前曾展示过一套不法分子可用于窃听并破解64位GSM信号的设备,这套设备的成本低于1000美元(大部分所使用的破解软件都是免费开源软件)。这种情况不禁令普通百姓,甚至持有商业秘密的大公司心惊肉跳。更令人感到害怕的是手机运营商的态度,当28岁的德国电脑工程师Karsten Nohl宣布破解了64位GSM加密系统之后,他们只是承认了这套系统在安全方面存在的问题,却迟迟不肯出台新的加密算法。
现在,连更高级别的手机通讯加密机制--KASUMI系统--这个采用128位A5/3加密算法的系统也已经被人们成功破解,这套系统正被目前新兴的3G网络所使用。相比去年破解64位的A5/1加密系统时采用Nvidia GPU集群花费了数月时间破解出2TB密码本的工作,这次破解行动则使用了更为复杂的算法(related-key sandwich attack)来破解128位加密系统,而且破解的过程只花了2个小时。
这次破解是由以色列Weizmann科学学院(Weizmann Institute of Science)数学系以及计算机科学系的教职员工完成的。参与这次破解的有Orr Dunkelman, Nathan Keller和Adi Shamir,其中Adi Shamir便是大名鼎鼎的RSA公钥加密算法的发明人之一。他们所使用的破解算法采用了一种不断更换密钥的方法,“我们只需要使用4个相关联的密钥,$2^{26}$个数据,$2^{30}$比特内存,经过$2^{32}$次计算便可以完成破解。计算的复杂性很小,我们使用单台PC机只花了不到两个小时便模拟了整个破解过程。”
不过据Karsten Nohl教授介绍,这种破解方式从有效性上看不如之前的A5/1破解方式,由于这种破解方法必须事先编好数百万条内容已知的明文,然后再把这些明文一一放到运营商的无线网络中进行传输,之后截获这些信息并对运营商的加密方式进行破解,因此虽然破解计算本身速度很快,但传送明文并搜集截获的信息所花费的时间却需要很长。
目前运营商所使用的KASUMI(A5/3)加密算法是在MISTY算法的基础上改进而来。MISTY算法是由三菱的工程师开发出来的,原始的MISTY算法保密性更强,但所需完成的计算任务则比KASUMI算法更为繁重。
一句话,即使进行大量的预计算,破解A5/3似乎也并不容易。(参见reflextor.com/trac/a51)
而本文专注的是A5/1算法的实现,及相应方程系统的描述。
2. A5/1算法结构
直接图示了……
图1 A5/1算法
图2 A5/1算法结构
图3 A5/1算法描述
图4 A5/1算法
3. 代码实现
#include <stdio.h>
#include <stdlib.h>
/* Terminology: LSB is the lowest-order bit, MSB is the highest-order bit. */

/* Width masks for the three shift registers. */
#define R1MASK ((1UL << 19) - 1) /* 19 bits, numbered 0..18 */
#define R2MASK ((1UL << 22) - 1) /* 22 bits, numbered 0..21 */
#define R3MASK ((1UL << 23) - 1) /* 23 bits, numbered 0..22 */

/* The "middle" bit of each register, inspected by the clock-control rule. */
#define R1MID (1UL << 8)  /* bit 8 */
#define R2MID (1UL << 10) /* bit 10 */
#define R3MID (1UL << 10) /* bit 10 */

/* Feedback taps: the bits XORed together to form the bit shifted in when a
 * register is clocked.  They correspond to the primitive polynomials
 *   x^19 + x^5 + x^2 + x + 1,
 *   x^22 + x + 1,
 *   x^23 + x^15 + x^2 + x + 1.
 */
#define R1TAPS ((1UL << 18) | (1UL << 17) | (1UL << 16) | (1UL << 13))
#define R2TAPS ((1UL << 21) | (1UL << 20))
#define R3TAPS ((1UL << 22) | (1UL << 21) | (1UL << 20) | (1UL << 7))

/* Output taps: the highest-order bit of each register feeds the keystream. */
#define R1OUT (1UL << 18) /* bit 18 (the high bit) */
#define R2OUT (1UL << 21) /* bit 21 (the high bit) */
#define R3OUT (1UL << 22) /* bit 22 (the high bit) */
typedef unsigned char byte; /* one octet of key or keystream */
typedef unsigned long word; /* holds one shift register (at least 32 bits) */
typedef word bit;           /* a single bit, stored as 0 or 1 */

/* Return the parity of x, i.e. the sum of all of its bits modulo 2.
 *
 * XOR is addition modulo 2, so XORing the upper half of the bits onto the
 * lower half keeps the overall parity while halving the number of bits that
 * still matter.  After the last fold, bit 0 holds the parity of the word.
 *
 * `word` is unsigned long, which may be wider than 32 bits.  The first fold
 * brings bits 32..63 (if present) down onto bits 0..31; it is written as two
 * 16-bit shifts because a single shift by 32 would be undefined when
 * unsigned long is exactly 32 bits wide (there the step is a no-op).
 */
bit parity(word x) {
    x ^= (x >> 16) >> 16;
    x ^= x >> 16;
    x ^= x >> 8;
    x ^= x >> 4;
    x ^= x >> 2;
    x ^= x >> 1;
    return x & 1;
}
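/* Why the folding above works: XOR is addition modulo 2, so XORing the upper
 * half of the bits onto the lower half leaves the total parity unchanged
 * while halving the number of bits that still matter.  Repeating this with
 * shifts of 16, 8, 4, 2 and 1 leaves the parity of the folded bits in bit 0.
 *
 * A straightforward bit-counting version, for comparison.  It is kept inside
 * this comment only; its inner remarks use // because C block comments do
 * not nest.
 *
 *   int parity(unsigned long ino)
 *   {
 *       int noofones = 0;
 *       unsigned long mask = 0x00000001ul;  // start at first bit
 *       while (mask != 0)                   // until all bits tested
 *       {
 *           if (mask & ino)                 // if bit is 1, increment noofones
 *           {
 *               noofones++;
 *           }
 *           mask = mask << 1;               // go to next bit
 *       }
 *       // if noofones is odd, least significant bit will be 1
 *       return (noofones & 1);
 *   }
 */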
- /* Clock one shift register */
- word clockone(word reg, word mask, word taps) {
- word t = reg & taps;//仅取抽头位,做反馈用
- reg = (reg << 1) & mask;//左移一位,&掩码,则表示只取掩码位
- reg |= parity(t);//反馈位
- return reg;
- }
- /* The three shift registers. They're in global variables to make the code
- * easier to understand.
- * A better implementation would not use global variables. */
- word R1, R2, R3;
- /* Look at the middle bits of R1,R2,R3, take a vote, and
- * return the majority value of those 3 bits. */
- bit majority() {
- int sum;
- sum = parity(R1&R1MID) + parity(R2&R2MID) + parity(R3&R3MID);
- if (sum >= 2)
- return 1;
- else
- return 0;
- }
- /* Clock two or three of R1,R2,R3, with clock control
- * according to their middle bits.
- * Specifically, we clock Ri whenever Ri's middle bit
- * agrees with the majority value of the three middle bits.*/
- void clock() {
- bit maj = majority();
- if (((R1&R1MID)!=0) == maj)//(R1&R1MID)!=0)的返回值是0或1
- R1 = clockone(R1, R1MASK, R1TAPS);
- if (((R2&R2MID)!=0) == maj)
- R2 = clockone(R2, R2MASK, R2TAPS);
- if (((R3&R3MID)!=0) == maj)
- R3 = clockone(R3, R3MASK, R3TAPS);
- }
- /* Clock all three of R1,R2,R3, ignoring their middle bits.
- * This is only used for key setup. */
- void clockallthree() {
- R1 = clockone(R1, R1MASK, R1TAPS);
- R2 = clockone(R2, R2MASK, R2TAPS);
- R3 = clockone(R3, R3MASK, R3TAPS);
- }
- /* Generate an output bit from the current state.
- * You grab a bit from each register via the output generation taps;
- * then you XOR the resulting three bits. */
- bit getbit() {
- return parity(R1&R1OUT)^parity(R2&R2OUT)^parity(R3&R3OUT);
- }
- /* Do the A5/1 key setup. This routine accepts a 64-bit key and
- * a 22-bit frame number. */
- void keysetup(byte key[8], word frame) {
- int i;
- bit keybit, framebit;
- /* Zero out the shift registers. */
- R1 = R2 = R3 = 0;
- /* Load the key into the shift registers,
- * LSB of first byte of key array first,
- * clocking each register once for every
- * key bit loaded. (The usual clock
- * control rule is temporarily disabled.) */
- for (i=0; i<64; i++)
- {
- clockallthree(); /* always clock */
- keybit = (key[i/8] >> (i&7)) & 1; /* The i-th bit of the key */
- R1 ^= keybit; R2 ^= keybit; R3 ^= keybit;
- }
- /* Load the frame number into the shift
- * registers, LSB first,
- * clocking each register once for every
- * key bit loaded. (The usual clock
- * control rule is still disabled.) */
- for (i=0; i<22; i++)
- {
- clockallthree(); /* always clock */
- framebit = (frame >> i) & 1; /* The i-th bit of the frame # */
- R1 ^= framebit; R2 ^= framebit; R3 ^= framebit;
- }
- /* Run the shift registers for 100 clocks
- * to mix the keying material and frame number
- * together with output generation disabled,
- * so that there is sufficient avalanche.
- * We re-enable the majority-based clock control
- * rule from now on. */
- for (i=0; i<100; i++) {
- clock();
- }
- /* Now the key is properly set up. */
- }
- /* Generate output. We generate 228 bits of
- * keystream output. The first 114 bits is for
- * the A->B frame; the next 114 bits is for the
- * B->A frame. You allocate a 15-byte buffer
- * for each direction, and this function fills
- * it in. */
- void run(byte AtoBkeystream[], byte BtoAkeystream[])
- {
- int i;
- /* Zero out the output buffers. */
- for (i=0; i<=113/8; i++)
- AtoBkeystream[i] = BtoAkeystream[i] = 0;
- /* Generate 114 bits of keystream for the
- * A->B direction. Store it, MSB first. */
- /*每个字节的高位先存*/
- for (i=0; i<114; i++) {
- clock();
- AtoBkeystream[i/8] |= getbit() << (7-(i&7));
- }
- /* Generate 114 bits of keystream for the
- * B->A direction. Store it, MSB first. */
- for (i=0; i<114; i++) {
- clock();
- BtoAkeystream[i/8] |= getbit() << (7-(i&7));
- }
- }
- /* Test the code by comparing it against
- * a known-good test vector. */
- void test() {
- byte key[8] = {0x12, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF};
- word frame = 0x134;
- byte goodAtoB[15] = { 0x53, 0x4E, 0xAA, 0x58, 0x2F, 0xE8, 0x15,
- 0x1A, 0xB6, 0xE1, 0x85, 0x5A, 0x72, 0x8C, 0x00 };
- byte goodBtoA[15] = { 0x24, 0xFD, 0x35, 0xA3, 0x5D, 0x5F, 0xB6,
- 0x52, 0x6D, 0x32, 0xF9, 0x06, 0xDF, 0x1A, 0xC0 };
- byte AtoB[15], BtoA[15];
- int i, failed=0;
- keysetup(key, frame);
- run(AtoB, BtoA);
- /* Compare against the test vector. */
- for (i=0; i<15; i++)
- if (AtoB[i] != goodAtoB[i])
- failed = 1;
- for (i=0; i<15; i++)
- if (BtoA[i] != goodBtoA[i])
- failed = 1;
- /* 先打印已知的密钥和帧数. */
- printf("key: 0x");
- for (i=0; i<8; i++)
- printf("%02X", key[i]);
- printf("\n");
- printf("frame number: 0x%06X\n", (unsigned int)frame);
- /* 打印已知的228位密钥流. */
- printf("known good output:\n");
- printf(" A->B: 0x");
- for (i=0; i<15; i++)
- printf("%02X", goodAtoB[i]);
- printf(" B->A: 0x");
- for (i=0; i<15; i++)
- printf("%02X", goodBtoA[i]);
- printf("\n");
- /* 打印计算出的228位密钥流. */
- printf("observed output:\n");
- printf(" A->B: 0x");
- for (i=0; i<15; i++)
- printf("%02X", AtoB[i]);
- printf(" B->A: 0x");
- for (i=0; i<15; i++)
- printf("%02X", BtoA[i]);
- printf("\n");
- if (!failed) {
- printf("Self-check succeeded: everything looks ok.\n");
- return;
- } else {
- /* Problems! The test vectors didn't compare*/
- printf("\nI don't know why this broke; contact the authors.\n");
- exit(1);
- }
- }
/* Program entry point: run the built-in self-test.  test() terminates the
 * process with status 1 on a mismatch, so returning from it means success. */
int main(void)
{
    test();

    return 0;
}
4. 试验结果
已知正确的228位密钥流,与程序新生成的228位密钥流的对比。
5. A5/1算法的方程描述
假设我们获得了A5/1的一系列密钥流,可以把线性移位寄存器中的状态及初始密钥当做变元,随着clock产生密钥流的方程。
先看两个小例子:
这里的线性移位寄存器的反馈函数可表示为矩阵的形式。而输出变元经过组合函数f便生出了密钥流位。这里的方程应该先反馈,再输出,还是先输出后反馈?对上例补一个说明……
再一个例子:
注:生出方程的次数的高低仅与非线性组合函数或过滤函数有关。
下面开始针对A5/1分析:
这个择多函数的描述很巧妙,实际,整个方程的非线性性也就由它来确定。
另对每个LFSR来说,其状态转移方程不像上面小例子乘个矩阵那么简单。因为每clock一下都要考虑到择多函数的干扰。
这里对每个V,根据vi的值去决定移动,一致则移,否则不移。在生出方程的时候,初始用64个变元表示所有3个寄存器的状态。
先看个小例子:
其所产生的方程全都是受状态转移方程的控制,可看到其相对复杂,其大小以指数形式增长,故光存这些多项式就很占内存。