// main.c
// gcc -g -o main main.c
#include <stdlib.h>
static int *ptr = NULL;
/*
 * Allocates room for `len` ints, stores the resulting pointer in the
 * file-scope `ptr`, and returns `index` unchanged.
 *
 * The assignment to `ptr` is a side effect visible to the caller. The
 * malloc result is not checked, and the previous value of `ptr` is
 * overwritten without being freed.
 */
int func(int len, int index)
{
    ptr = malloc(len * sizeof(int));
    return index;
}
/*
 * Writes 1234 through `ptr`, indexed by the return value of func(1, 0).
 *
 * func() assigns `ptr` as a side effect, while the subscript expression also
 * reads `ptr`. C does not specify whether that read happens before or after
 * the call, so the write may go through the initial NULL pointer or through
 * the freshly allocated buffer, depending on the compiler.
 */
int main()
{
    /* Equivalent to *(ptr + func(1, 0)) = 1234; operand order is unspecified. */
    ptr[func(1, 0)] = 1234;
    return 0;
}
在 Skynet 生活群有同学反应上述代码编译后在某些平台会崩掉,有些平台不会崩掉,想知道为什么。
我自己在 Ubuntu 16.04 上测试,发现也是会崩溃。
随后有大佬说这属于未定义的行为,取决于编译器具体的实现,所以才会有些平台会崩掉,有些不会崩掉。大佬还提到,要学会阅读标准。
写这篇博客主要做两件事情:
- 通过反汇编二进制文件了解为何上述代码生成的可执行文件会崩溃。
- 通过阅读标准文档了解为何此行为属于未定义,学会在标准中寻找依据。
语法分析
int a[10]; 定义了一个 int 类型数组 a ,包含 10 个元素。a[i] 表示第 i 个元素。
以下引用内容来自 The C Programming Language - 5.3 Pointers and Arrays 小节。
The notation a[i] refers to the i-th element of the array. Rather more surprising, at first sight, is the fact that a reference to a[i] can also be written as *(a+i). In evaluating a[i], C converts it to *(a+i) immediately; the two forms are equivalent. In short, an array-and-index expression is equivalent to one written as a pointer and offset.
根据上述说明,ptr[func(1, 0)] = 1234; 等同于 *(ptr + func(1, 0)) = 1234; 。该语句含义是使用解引用操作符 * 向指定地址写入数据,而指定地址是表达式 ptr + func(1, 0) 的求值结果。
- 若表达式先执行 func(1, 0),此时ptr已被赋值,然后再加上ptr,则此时求值结果是分配后的 ptr 的地址,程序正确执行。
- 若表达式先执行 ptr,即先获取ptr的值,此时是0,然后再加上func(1, 0),则此时求值结果是0,程序执行向地址0写入数据,从而进程崩溃。
因此,文章开头的代码之所以能正确执行,是因为编译器在对表达式 ptr + func(1, 0) 中加法操作符的两个操作数 ptr 和 func(1, 0) 求值时,碰巧先执行了 func(1, 0) 。
先说结论:在 C 中,操作符各操作数的求值顺序是未指定的(unspecified)。所以,文章开头的代码是否崩溃取决于编译器选择的求值顺序;一旦先读取 ptr ,就会对空指针解引用,而这属于未定义行为(undefined behavior)。
下面看一下具体的执行指令。
查看 object file 反汇编
执行 gcc -g -c main.c 仅编译,生成 main.o 。然后执行 objdump -drCwS main.o 查看 main.o 反汇编代码:
Disassembly of section .text:
0000000000000000 <func>:
static int *ptr = NULL;
int func(int len, int index)
{
   0:   55                      push   %rbp
   1:   48 89 e5                mov    %rsp,%rbp
   4:   48 83 ec 10             sub    $0x10,%rsp
   8:   89 7d fc                mov    %edi,-0x4(%rbp) ; 参数 len
   b:   89 75 f8                mov    %esi,-0x8(%rbp) ; 参数 index
    ptr = malloc(len * sizeof(int));
   e:   8b 45 fc                mov    -0x4(%rbp),%eax ; 获取 len 值
  11:   48 98                   cltq
  13:   48 c1 e0 02             shl    $0x2,%rax ; 计算 len * sizeof(int)
  17:   48 89 c7                mov    %rax,%rdi ; 通过 %rdi 传递参数给 malloc
  1a:   e8 00 00 00 00          callq  1f <func+0x1f>   1b: R_X86_64_PC32       malloc-0x4
  1f:   48 89 05 00 00 00 00    mov    %rax,0x0(%rip)        # 26 <func+0x26>   22: R_X86_64_PC32       .bss-0x4
    return index;
  26:   8b 45 f8                mov    -0x8(%rbp),%eax ; 返回参数 index
}
  29:   c9                      leaveq
  2a:   c3                      retq
000000000000002b <main>:
int main()
{
  2b:   55                      push   %rbp
  2c:   48 89 e5                mov    %rsp,%rbp
  2f:   53                      push   %rbx
  30:   48 83 ec 08             sub    $0x8,%rsp
    ptr[func(1, 0)] = 1234;
  34:   48 8b 1d 00 00 00 00    mov    0x0(%rip),%rbx        # 3b <main+0x10>   37: R_X86_64_PC32       .bss-0x4
  3b:   be 00 00 00 00          mov    $0x0,%esi ; 给 func 参数 index 传递值 0
  40:   bf 01 00 00 00          mov    $0x1,%edi ; 给 func 参数 len 传递值 1
  45:   e8 00 00 00 00          callq  4a <main+0x1f>   46: R_X86_64_PC32       func-0x4
  4a:   48 98                   cltq
  4c:   48 c1 e0 02             shl    $0x2,%rax ; 根据 func 返回值计算偏移字节
  50:   48 01 d8                add    %rbx,%rax ; 计算 ptr + func(1, 0) 结果存在 %rax 中
  53:   c7 00 d2 04 00 00       movl   $0x4d2,(%rax) ; 向指定地址写入数据 1234
    return 0;
  59:   b8 00 00 00 00          mov    $0x0,%eax
  5e:   48 83 c4 08             add    $0x8,%rsp
  62:   5b                      pop    %rbx
  63:   5d                      pop    %rbp
  64:   c3                      retq
最左侧 0: 到 64: 是 .text 节偏移字节,标识指令的起始偏移地址。
在函数 main 中:
地址 34: 处,指令 mov 0x0(%rip),%rbx 用于获取变量 ptr 的值,此时还未对 ptr 赋值,获取的值为 0 。至于 0x0(%rip) 表示重定位,链接器生成可执行文件时,会修改这里,真正引用到 ptr 的内存地址。
地址 3b: - 45: 处,指令用于通过寄存器传递参数,然后调用 func 函数。
地址 4a: - 4c: 处,处理 func 函数返回值,并计算偏移字节。func 函数返回值在 %eax 寄存器中(因为返回值是 int 类型),而我用的是 64 位系统,指针运算按 64 位进行,因此先用 cltq 将 %eax 符号扩展到 %rax 寄存器;一个 int 类型占用 4 字节,所以 shl $0x2,%rax 即索引 * 4 ,计算出偏移字节。
地址 50: 处,指令 add %rbx,%rax 计算的就是表达式 ptr + func(1, 0) 的值。而前面的分析表明,表达式先读取了 ptr 的值(此时为 0 ),所以这个表达式的结果是 0 。
地址 53: 处,指令 movl $0x4d2,(%rax) 会向地址 0 写入数据 1234 ,造成内存写入错误,进程崩溃。
经过对执行指令的分析,下面看一下运行时相关寄存器的值,进一步确认。
执行 gcc -g -o main main.c 生成可执行文件,然后执行 gdb main 开始调试。
查看 main 函数反汇编代码,和上述 main.o 反汇编代码类似,只是相应的地址会变成运行时地址。
(gdb) disassemble main
Dump of assembler code for function main:
   0x0000000000400551 <+0>:     push   %rbp
   0x0000000000400552 <+1>:     mov    %rsp,%rbp
   0x0000000000400555 <+4>:     push   %rbx
   0x0000000000400556 <+5>:     sub    $0x8,%rsp
   0x000000000040055a <+9>:     mov    0x200adf(%rip),%rbx        # 0x601040 <ptr>
   0x0000000000400561 <+16>:    mov    $0x0,%esi ; 给 func 参数 index 传递值 0
   0x0000000000400566 <+21>:    mov    $0x1,%edi ; 给 func 参数 len 传递值 1
   0x000000000040056b <+26>:    callq  0x400526 <func>
   0x0000000000400570 <+31>:    cltq
   0x0000000000400572 <+33>:    shl    $0x2,%rax ; 根据 func 返回值计算偏移字节
   0x0000000000400576 <+37>:    add    %rbx,%rax ; 计算 ptr + func(1, 0) 结果存在 %rax 中
   0x0000000000400579 <+40>:    movl   $0x4d2,(%rax)
   0x000000000040057f <+46>:    mov    $0x0,%eax
   0x0000000000400584 <+51>:    add    $0x8,%rsp
   0x0000000000400588 <+55>:    pop    %rbx
   0x0000000000400589 <+56>:    pop    %rbp
   0x000000000040058a <+57>:    retq
End of assembler dump.
地址 0x000000000040055a: 处,获取变量 ptr 的值。前面提到了链接器重定位,观察指令 mov 0x200adf(%rip),%rbx 发现,ptr 的地址由 main.o 中 0x0(%rip) 变成了 0x200adf(%rip) ,计算 %rip + 0x200adf = 0x0000000000400561 +0x200adf = 0x601040 得到 ptr 的内存地址。%rip 是下一条指令的地址。
在地址 0x0000000000400561: 处设置断点,查看寄存器 %rbx 值。
(gdb) break *0x0000000000400561
Breakpoint 1 at 0x400561: file test.c, line 14.
(gdb) r
Starting program: /home/nemo/work/just/t
Breakpoint 1, 0x0000000000400561 in main () at test.c:14
14          *(ptr + func(1, 0)) = 1234;
(gdb) p $rbx
$1 = 0
(gdb) p $rip
$2 = (void (*)()) 0x400561 <main+16>
在地址 0x0000000000400576: 处设置断点,查看 %rax 值。接着输入 (gdb) si 执行下一条指令,查看表达式的值。
(gdb) break *0x0000000000400576
Breakpoint 2 at 0x400576: file test.c, line 14.
(gdb) c
Continuing.
Breakpoint 2, 0x0000000000400576 in main () at test.c:14
14          *(ptr + func(1, 0)) = 1234;
(gdb) p $rax
$3 = 0
(gdb) si
0x0000000000400579      14          *(ptr + func(1, 0)) = 1234;
(gdb) p $rax
$4 = 0
于是在执行地址 0x0000000000400579: 处的指令前,表达式的值是 0 。继续执行则出现了内存写入错误,进程退出。
(gdb) c
Continuing.
Program received signal SIGSEGV, Segmentation fault.
0x0000000000400579 in main () at test.c:14
14          *(ptr + func(1, 0)) = 1234;
这就是程序整个执行过程。
了解了执行过程后,来阅读一下标准,养成阅读标准解决问题的习惯。
Evaluation Order
以下引用内容来自 The C Programming Language - 2.12 Precedence and Order of Evaluation :
C, like most languages, does not specify the order in which the operands of an operator are evaluated. (The exceptions are &&, ||, ?:, and `,'.)
因此可知,C 标准中并未指定对操作符的操作数进行求值的顺序。详情可阅读书中章节。
此外 cppreference 也给出了 C 求值顺序说明,也表明了并未指定求值顺序。
Order of evaluation of the operands of any C operator, including the order of evaluation of function arguments in a function-call expression, and the order of evaluation of the subexpressions within any expression is unspecified (except where noted below). There is no concept of left-to-right or right-to-left evaluation in C, which is not to be confused with left-to-right and right-to-left associativity of operators: the expression f1() + f2() + f3() is parsed as (f1() + f2()) + f3() due to left-to-right associativity of operator+, but the function call to f3 may be evaluated first, last, or between f1() or f2() at run time.
上面提到,不要把操作符的结合性和对操作符的操作数求值的顺序混淆。
下面想聊聊阅读求值顺序涉及到的一些术语。
Side Effect : In computer science, an operation, function or expression is said to have a side effect if it modifies some state variable value(s) outside its local environment, that is to say has an observable effect besides returning a value (the main effect) to the invoker of the operation. Example side effects include modifying a non-local variable, modifying a static local variable, modifying a mutable argument passed by reference, performing I/O or calling other side-effect functions.
Sequence Point : A sequence point defines any point in a computer program's execution at which it is guaranteed that all side effects of previous evaluations will have been performed, and no side effects from subsequent evaluations have yet been performed.
简单理解 side effect 就是指比如 int a = i++; 中的 i++ 就带有 side effect ,因为执行后 i 的值增 1 了。而简单理解 sequence point 就是指比如 int a = i++; int b = j++; 中,i 自增完成后,才会执行后续的 int b = j++; 语句。当然这里的前提都是单线程环境。
根据此可知:With C++11, usage of the term sequence point has been replaced by sequencing. There are three possibilities:
- An expression's evaluation can be sequenced before that of another expression, or equivalently the other expression's evaluation is sequenced after that of the first.
- The expressions' evaluation is indeterminately sequenced, meaning one is sequenced before the other, but which is unspecified.
- The expressions' evaluation is unsequenced.
乍一看 indeterminately sequenced 和 unsequenced 好像没有区别。区别在于:对于 indeterminately sequenced ,两个求值的机器指令不会交织(interleave),只是先后顺序不确定;而对于 unsequenced ,机器指令可以按任意顺序执行,包括相互交织。
细节在这里 有描述:
"sequenced-before" is an asymmetric, transitive, pair-wise relationship between evaluations within the same thread (it may extend across threads if atomic types and memory barriers are involved).
- 针对 sequence point 描述:
If a sequence point is present between the subexpressions E1 and E2, then both value computation and side effects of E1 are sequenced-before every value computation and side effect of E2
- 针对 A 先于 B 计算:
If evaluation A is sequenced before evaluation B, then evaluation of A will be complete before evaluation of B begins.
- 针对 B 先于 A 计算:
If A is not sequenced before B and B is sequenced before A, then evaluation of B will be complete before evaluation of A begins.
- 若 A 和 B 的计算顺序无法确定,则存在两种情况:unsequenced 和 indeterminately sequenced :
If A is not sequenced before B and B is not sequenced before A, then two possibilities exist: ■ evaluations of A and B are unsequenced: they may be performed in any order and may overlap (within a single thread of execution, the compiler may interleave the CPU instructions that comprise A and B) ■ evaluations of A and B are indeterminably-sequenced: they may be performed in any order but may not overlap: either A will be complete before B, or B will be complete before A. The order may be the opposite the next time the same expression is evaluated.
下面是标准中定义的一些顺序,这里不全部列出了。下文中的 function designator 是指具有函数类型的表达式(例如函数名 func ),也就是函数调用表达式中位于 () 之前、用来指明被调用函数的那一部分。
- There is a sequence point after the evaluation of all function arguments and of the function designator, and before the actual function call.
- There is a sequence point after evaluation of the first (left) operand and before evaluation of the second (right) operand of the following binary operators: && (logical AND), || (logical OR), and , (comma).
总之,阅读标准虽然很枯燥,但读进去之后就会读得津津有味。要多练习在标准中寻找依据,解决实际的问题。
 
  
  
  
 
 
  
 
 
 