最近正在看四哥的黄皮书,看里面so文件解析这一章是用java实现的,为加深理解自己实现了一遍。

写这种文件解析和以前写网络packet解析的思路差不多:先搞清楚文件格式,然后把格式翻译成代码。下面是ELF文件头(ELF Header)的结构。ELF文件中只有ELF Header的位置是固定的。





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
// ELF32 file header. It sits at offset 0 of the file and is the only
// structure with a fixed position; the program/section header tables are
// located through e_phoff / e_shoff.
typedef struct elf32_hdr {
    unsigned char e_ident[EI_NIDENT]; // magic number (0x7f 'E' 'L' 'F') followed by class, data encoding and version bytes
    Elf32_Half    e_type;             // object file type
    Elf32_Half    e_machine;          // architecture
    Elf32_Word    e_version;          // object file version
    Elf32_Addr    e_entry;            // entry point
    Elf32_Off     e_phoff;            // file offset of the program header table
    Elf32_Off     e_shoff;            // file offset of the section header table
    Elf32_Word    e_flags;            // processor-specific flags (per the ELF specification)
    Elf32_Half    e_ehsize;           // size of this ELF header
    Elf32_Half    e_phentsize;        // size of one program header table entry
    Elf32_Half    e_phnum;            // number of program header entries
    Elf32_Half    e_shentsize;        // size of one section header table entry
    Elf32_Half    e_shnum;            // number of section header entries
    Elf32_Half    e_shstrndx;         // index of the string-table section within the section header table
} Elf32_Ehdr;

elfhdr是字典,以key:value的方式存储header的各个属性。每个属性从第几个字节开始、占几个字节都是固定的,剩下的就是学习python的文件操作函数和binascii库函数。调试的话目前基本就用print输出,更高级的调试方法还不太清楚。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def init_elf(filename):
    """Read the ELF32 file header of *filename* into the global ``elfhdr`` dict.

    The 16-byte ``e_ident`` array is stored unchanged under ``'magic'``; the
    thirteen fields that follow it (``e_type`` ... ``e_shstrndx``) are stored
    as integers under their struct names.

    The header is always decoded with the 32-bit layout (52 bytes). The byte
    order follows ``e_ident[5]`` (EI_DATA): ``2`` selects big endian, any
    other value is decoded as little endian.

    Args:
        filename: Path of the file to inspect.

    Returns:
        The dict of parsed fields (the same values merged into ``elfhdr``).

    Raises:
        SystemExit: With status 0, after printing a message, when the file
            does not start with the ELF magic number.
        ValueError: When the file has the ELF magic but is shorter than a
            complete 52-byte header.
    """
    # Function-scope import so the block does not depend on the file's
    # import section.
    import struct

    global elfhdr

    ident_size = 16
    # Fields in on-disk order after e_ident.
    field_names = (
        'e_type', 'e_machine', 'e_version', 'e_entry', 'e_phoff', 'e_shoff',
        'e_flags', 'e_ehsize', 'e_phentsize', 'e_phnum', 'e_shentsize',
        'e_shnum', 'e_shstrndx',
    )
    # H = Elf32_Half (2 bytes), I = Elf32_Word / Elf32_Addr / Elf32_Off (4 bytes).
    layout = 'HHIIIIIHHHHHH'
    header_size = ident_size + struct.calcsize('<' + layout)

    # One read inside a context manager: the handle is released on every
    # path, including the early exit below.
    with open(filename, 'rb') as f:
        raw = f.read(header_size)

    ident = raw[:ident_size]
    # 7f 45 4c 46 -> 0x7f 'E' 'L' 'F'
    if ident[:4] != b'\x7fELF':
        print("the file is not elf!")
        raise SystemExit(0)
    if len(raw) < header_size:
        raise ValueError(
            "truncated ELF header: expected %d bytes, got %d"
            % (header_size, len(raw))
        )

    # '<' / '>' also select standard sizes with no padding.
    byte_order = '>' if bytearray(ident)[5] == 2 else '<'
    values = struct.unpack(byte_order + layout, raw[ident_size:header_size])

    header = {'magic': ident}
    header.update(zip(field_names, values))
    # Mutate the existing dict so other references to it stay valid.
    elfhdr.update(header)
    return header

解析函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def parse_elfhdr():
    """Print the contents of the global ``elfhdr`` dict, one field per line.

    Reads ``elfhdr['magic']`` (the raw ``e_ident`` bytes) and the integer
    fields ``e_type`` ... ``e_shstrndx``. Returns nothing.

    Raises:
        SystemExit: With status 0, after printing a message, when the class
            byte ``e_ident[4]`` is neither 1 (ELF32) nor 2 (ELF64).
    """
    # bytearray yields integers on both Python 2 and 3, so no hex-string
    # round trip is needed.
    ident = bytearray(elfhdr['magic'])

    # Dump every e_ident byte. The previous loop stopped at len - 2 and
    # silently dropped the final byte.
    print("Magic:  " + "  ".join("%02x" % byte for byte in ident))

    # e_ident[4]: 0 is invalid, 1 is ELF32, 2 is ELF64.
    ei_class = ident[4]
    if ei_class == 1:
        print("Class: ELF32")
    elif ei_class == 2:
        print("Class: ELF64")
    else:
        print("invalid el_class!")
        raise SystemExit(0)

    # e_ident[5]: 1 is little endian, 2 is big endian. An invalid value is
    # reported but, as before, does not stop the dump.
    ei_data = ident[5]
    if ei_data == 1:
        print("Data: little endian")
    elif ei_data == 2:
        print("Data: big endian")
    else:
        print("invalid el_data!")

    # e_ident[6] is used as a number directly; parsing its hex digits as
    # decimal raised ValueError for bytes such as 0x0a. Only version 1 is
    # the current ELF version, so the suffix is limited to it.
    ei_version = ident[6]
    version_text = str(ei_version)
    if ei_version == 1:
        version_text += " (current)"
    print("Version: " + version_text)

    print("Type: " + str(elfhdr['e_type']))
    print("Machine " + str(elfhdr['e_machine']))
    print("Version " + str(elfhdr['e_version']))
    print("入口点地址 " + str(elfhdr['e_entry']))
    print("程序头起点 " + str(elfhdr['e_phoff']))
    print("Start of section headers: " + str(elfhdr['e_shoff']))
    print("标志: " + "0x%x" % elfhdr['e_flags'])
    print("本头的大小: " + str(elfhdr['e_ehsize']) + "字节")
    # e_phentsize is the size of one program header entry, not of the
    # program, so the label now says "program header size".
    print("程序头大小: " + str(elfhdr['e_phentsize']) + "字节")
    print("Number of program headers: " + str(elfhdr['e_phnum']))
    print("节头大小: " + str(elfhdr['e_shentsize']) + "字节")
    print("节头数量: " + str(elfhdr['e_shnum']))
    print("字符串表索引节头: " + str(elfhdr['e_shstrndx']))

Program Header程序头信息:

ELF文件中的程序头部是个结构数组,每个结构描述了一个段,或者系统准备程序执行所必需的其他信息。解析时还是要关注ELF Header中的几个关键字段:e_phnum是程序头的个数,e_phentsize是程序头部表格中每个表项的大小,e_phoff是程序头部表格在文件中的偏移量:

1
2
3
4
5
6
7
8
9
10
// ELF32 program header: one entry of the program header table.
// The table starts at e_phoff and holds e_phnum entries of e_phentsize bytes.
// Field meanings follow the ELF specification.
typedef struct elf32_phdr {
    Elf32_Word p_type;   // segment type
    Elf32_Off  p_offset; // file offset of the segment
    Elf32_Addr p_vaddr;  // virtual address of the segment in memory
    Elf32_Addr p_paddr;  // physical address, where relevant
    Elf32_Word p_filesz; // size of the segment in the file
    Elf32_Word p_memsz;  // size of the segment in memory
    Elf32_Word p_flags;  // segment flags
    Elf32_Word p_align;  // segment alignment
} Elf32_Phdr;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
def parse_program_headers(elf_file):
    """Parse and print every ELF32 program header of *elf_file*.

    The table location comes from the global ``elfhdr`` dict: ``e_phoff``
    (table offset), ``e_phentsize`` (bytes per entry) and ``e_phnum`` (entry
    count). Byte order follows ``elfhdr['magic'][5]`` (2 = big endian,
    anything else, or a missing key, = little endian).

    Args:
        elf_file: Path of the ELF file to read.

    Returns:
        A list with one dict per entry, in table order. Each dict maps the
        eight ``Elf32_Phdr`` field names to the field value as an
        8-digit lowercase hex string (e.g. ``'00000001'``).

    Raises:
        ValueError: When an entry extends past the end of the file.
    """
    # Function-scope import so the block does not depend on the file's
    # import section.
    import struct

    field_names = (
        'p_type', 'p_offset', 'p_vaddr', 'p_paddr',
        'p_filesz', 'p_memsz', 'p_flags', 'p_align',
    )
    table_offset = elfhdr['e_phoff']
    entry_size = elfhdr['e_phentsize']
    entry_count = elfhdr['e_phnum']

    is_big_endian = bytearray(elfhdr.get('magic', b''))[5:6] == bytearray(b'\x02')
    entry_format = ('>' if is_big_endian else '<') + 'I' * len(field_names)
    entry_length = struct.calcsize(entry_format)

    programs = []
    with open(elf_file, 'rb') as f:
        for index in range(entry_count):
            # Each offset is computed from the fixed table start. Adding
            # index * size to a running total skipped entries from index 2
            # onward.
            f.seek(table_offset + index * entry_size, 0)
            raw = f.read(entry_length)
            if len(raw) < entry_length:
                raise ValueError(
                    "program header %d is truncated (%d of %d bytes)"
                    % (index, len(raw), entry_length)
                )
            values = struct.unpack(entry_format, raw)
            entry = dict(zip(field_names, ('%08x' % value for value in values)))
            print(entry)
            programs.append(entry)
    return programs

Section Header解析实现和Program Header类似:

1
2
3
4
5
6
7
8
9
10
11
12
// ELF32 section header: one entry of the section header table.
// The table starts at e_shoff and holds e_shnum entries of e_shentsize bytes.
// Field meanings follow the ELF specification.
typedef struct elf32_shdr {
    Elf32_Word sh_name;      // section name, as an index into the section header string table
    Elf32_Word sh_type;      // section type
    Elf32_Word sh_flags;     // section flags
    Elf32_Addr sh_addr;      // address of the section in memory, if it is loaded
    Elf32_Off  sh_offset;    // file offset of the section contents
    Elf32_Word sh_size;      // size of the section in bytes
    Elf32_Word sh_link;      // link to another section (meaning depends on sh_type)
    Elf32_Word sh_info;      // extra information (meaning depends on sh_type)
    Elf32_Word sh_addralign; // section alignment
    Elf32_Word sh_entsize;   // entry size, if the section holds a table of fixed-size entries
} Elf32_Shdr;
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

def parse_section_headers(elf_file):
    """Parse and print every ELF32 section header of *elf_file*.

    The table location comes from the global ``elfhdr`` dict: ``e_shoff``
    (table offset), ``e_shentsize`` (bytes per entry) and ``e_shnum`` (entry
    count). Byte order follows ``elfhdr['magic'][5]`` (2 = big endian,
    anything else, or a missing key, = little endian).

    Args:
        elf_file: Path of the ELF file to read.

    Returns:
        A list with one dict per entry, in table order. Each dict maps the
        ten ``Elf32_Shdr`` field names to the field value as an
        8-digit lowercase hex string (e.g. ``'00000001'``).

    Raises:
        ValueError: When an entry extends past the end of the file.
    """
    # Function-scope import so the block does not depend on the file's
    # import section.
    import struct

    field_names = (
        'sh_name', 'sh_type', 'sh_flags', 'sh_addr', 'sh_offset',
        'sh_size', 'sh_link', 'sh_info', 'sh_addralign', 'sh_entsize',
    )
    table_offset = elfhdr['e_shoff']     # start of the section header table
    entry_size = elfhdr['e_shentsize']   # bytes per section header
    entry_count = elfhdr['e_shnum']      # number of section headers

    is_big_endian = bytearray(elfhdr.get('magic', b''))[5:6] == bytearray(b'\x02')
    entry_format = ('>' if is_big_endian else '<') + 'I' * len(field_names)
    entry_length = struct.calcsize(entry_format)

    sections = []
    with open(elf_file, 'rb') as f:
        for index in range(entry_count):
            # Each offset is computed from the fixed table start. Adding
            # index * size to a running total skipped entries from index 2
            # onward.
            f.seek(table_offset + index * entry_size, 0)
            raw = f.read(entry_length)
            if len(raw) < entry_length:
                raise ValueError(
                    "section header %d is truncated (%d of %d bytes)"
                    % (index, len(raw), entry_length)
                )
            values = struct.unpack(entry_format, raw)
            entry = dict(zip(field_names, ('%08x' % value for value in values)))
            print(entry)
            sections.append(entry)
    return sections