判断文件类型【一】

判断文件类型【一】

引言

现有一文件,其扩展名未知或标记错误。假设它是一个正常的、非空的文件,且将扩展名更正后可以正常使用,那么,如何判断它是哪种类型的文件?

思路

使用文件签名(又称 magic numbers 或 Magic Bytes)来判断实际文件类型。关于文件签名的介绍,参考 List of file signatures。

Magic Number:

  • magic number是嵌入在文件开头或附近的数字,指示其文件格式(即文件的类型)。
  • 用普通方式打开文件时看不到这个数字,需要借助十六进制查看器才能看到。
  • 大多数二进制文件格式都有这样一个数字,用来标识文件类型,通常以十六进制表示。

C#实现参考

C#实现

example.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
/// <summary>
/// Guesses a file's MIME type from the signature ("magic number") bytes at the
/// start of its content. The file name is consulted only to disambiguate
/// container formats that share one signature (Ogg, ASF, ZIP and, as a last
/// resort, RIFF).
/// </summary>
public class MimeType
{
    // Leading-byte signatures, written in decimal. ASCII renderings are noted where readable.
    private static readonly byte[] BMP = { 66, 77 };                                   // "BM"
    private static readonly byte[] DOC = { 208, 207, 17, 224, 161, 177, 26, 225 };     // D0 CF 11 E0 A1 B1 1A E1
    private static readonly byte[] EXE_DLL = { 77, 90 };                               // "MZ"
    private static readonly byte[] GIF = { 71, 73, 70, 56 };                           // "GIF8"
    private static readonly byte[] ICO = { 0, 0, 1, 0 };
    private static readonly byte[] JPG = { 255, 216, 255 };                            // FF D8 FF
    private static readonly byte[] MP3 = { 255, 251, 48 };                             // FF FB 30
    private static readonly byte[] OGG = { 79, 103, 103, 83, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0 }; // "OggS" ...
    // "%PDF-" only: the version digits that follow are deliberately not part of
    // the signature, so "%PDF-2.0" is recognised as well as "%PDF-1.x".
    private static readonly byte[] PDF = { 37, 80, 68, 70, 45 };
    private static readonly byte[] PNG = { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 };
    // "Rar!" 1A 07: the prefix shared by the RAR 4.x (…07 00) and RAR 5 (…07 01 00) markers.
    private static readonly byte[] RAR = { 82, 97, 114, 33, 26, 7 };
    private static readonly byte[] SWF = { 70, 87, 83 };                               // "FWS"
    private static readonly byte[] TIFF = { 73, 73, 42, 0 };                           // "II*\0" (little-endian)
    private static readonly byte[] TIFF_BE = { 77, 77, 0, 42 };                        // "MM\0*" (big-endian)
    private static readonly byte[] TORRENT = { 100, 56, 58, 97, 110, 110, 111, 117, 110, 99, 101 }; // "d8:announce"
    private static readonly byte[] TTF = { 0, 1, 0, 0, 0 };
    private static readonly byte[] WAV_AVI = { 82, 73, 70, 70 };                       // "RIFF"
    private static readonly byte[] WMV_WMA = { 48, 38, 178, 117, 142, 102, 207, 17, 166, 217, 0, 170, 0, 98, 206, 108 };
    private static readonly byte[] ZIP_DOCX = { 80, 75, 3, 4 };                        // "PK" 03 04

    // RIFF form-type tags; in a RIFF file they sit at byte offset 8, after the
    // 4-byte "RIFF" tag and the 4-byte chunk size.
    private static readonly byte[] RIFF_AVI = { 65, 86, 73, 32 };                      // "AVI "
    private static readonly byte[] RIFF_WAVE = { 87, 65, 86, 69 };                     // "WAVE"
    private const int RiffFormTypeOffset = 8;

    /// <summary>
    /// Returns the MIME type that matches the leading bytes of <paramref name="file"/>.
    /// </summary>
    /// <param name="file">The file content; only its first bytes are inspected.</param>
    /// <param name="fileName">
    /// The file's name. Its extension (compared case-insensitively) picks between
    /// MIME types that share a signature.
    /// </param>
    /// <returns>
    /// The detected MIME type, or <c>application/octet-stream</c> when
    /// <paramref name="fileName"/> is null/blank or no known signature matches.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="file"/> is null while <paramref name="fileName"/> is not blank.
    /// </exception>
    public static string GetMimeType(byte[] file, string fileName)
    {
        string mime = "application/octet-stream"; // default for unknown content

        // Existing contract: without a usable file name no detection is attempted.
        if (string.IsNullOrWhiteSpace(fileName))
        {
            return mime;
        }

        if (file == null)
        {
            throw new System.ArgumentNullException("file");
        }

        // Compared with OrdinalIgnoreCase below: a culture-sensitive ToUpper()
        // would turn ".avi" into ".AVİ" under a Turkish culture and miss the match.
        string extension = Path.GetExtension(fileName) ?? string.Empty;

        if (HasSignature(file, BMP))
        {
            mime = "image/bmp";
        }
        else if (HasSignature(file, DOC))
        {
            mime = "application/msword";
        }
        else if (HasSignature(file, EXE_DLL))
        {
            mime = "application/x-msdownload"; // EXE and DLL share this MIME type
        }
        else if (HasSignature(file, GIF))
        {
            mime = "image/gif";
        }
        else if (HasSignature(file, ICO))
        {
            mime = "image/x-icon";
        }
        else if (HasSignature(file, JPG))
        {
            mime = "image/jpeg";
        }
        else if (HasSignature(file, MP3))
        {
            mime = "audio/mpeg";
        }
        else if (HasSignature(file, OGG))
        {
            if (HasExtension(extension, ".OGX"))
            {
                mime = "application/ogg";
            }
            else if (HasExtension(extension, ".OGA"))
            {
                mime = "audio/ogg";
            }
            else
            {
                mime = "video/ogg";
            }
        }
        else if (HasSignature(file, PDF))
        {
            mime = "application/pdf";
        }
        else if (HasSignature(file, PNG))
        {
            mime = "image/png";
        }
        else if (HasSignature(file, RAR))
        {
            mime = "application/x-rar-compressed";
        }
        else if (HasSignature(file, SWF))
        {
            mime = "application/x-shockwave-flash";
        }
        else if (HasSignature(file, TIFF) || HasSignature(file, TIFF_BE))
        {
            mime = "image/tiff";
        }
        else if (HasSignature(file, TORRENT))
        {
            mime = "application/x-bittorrent";
        }
        else if (HasSignature(file, TTF))
        {
            mime = "application/x-font-ttf";
        }
        else if (HasSignature(file, WAV_AVI))
        {
            // Trust the form type stored in the data over the (possibly wrong)
            // extension; fall back to the extension only for other RIFF forms
            // or a truncated header.
            if (HasSignature(file, RIFF_AVI, RiffFormTypeOffset))
            {
                mime = "video/x-msvideo";
            }
            else if (HasSignature(file, RIFF_WAVE, RiffFormTypeOffset))
            {
                mime = "audio/x-wav";
            }
            else
            {
                mime = HasExtension(extension, ".AVI") ? "video/x-msvideo" : "audio/x-wav";
            }
        }
        else if (HasSignature(file, WMV_WMA))
        {
            mime = HasExtension(extension, ".WMA") ? "audio/x-ms-wma" : "video/x-ms-wmv";
        }
        else if (HasSignature(file, ZIP_DOCX))
        {
            mime = HasExtension(extension, ".DOCX")
                ? "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                : "application/x-zip-compressed";
        }

        return mime;
    }

    // True when 'data' contains every byte of 'signature' starting at 'offset';
    // data too short to hold the whole signature never matches.
    private static bool HasSignature(byte[] data, byte[] signature, int offset = 0)
    {
        if (data.Length - offset < signature.Length)
        {
            return false;
        }

        for (int i = 0; i < signature.Length; i++)
        {
            if (data[offset + i] != signature[i])
            {
                return false;
            }
        }

        return true;
    }

    // Culture-independent, case-insensitive extension comparison.
    private static bool HasExtension(string extension, string expected)
    {
        return string.Equals(extension, expected, System.StringComparison.OrdinalIgnoreCase);
    }
}

进制转换

摘录部分常用文件的十六进制(HEX)签名,来源:GCK’S FILE SIGNATURES TABLE

十六进制签名 ASCII 签名
文件扩展名 文件描述
50 4B 03 04 PK..
ZIP PKZIP archive file (Ref. 1 Ref. 2)
Trailer: filename 50 4B 17 characters 00 00 00
Trailer: (filename PK 17 characters …)
Note: PK are the initials of Phil Katz, co-creator of the ZIP file format and author of PKZIP.
ZIP Apple Mac OS X Dashboard Widget, Aston Shell theme, Oolite eXpansion Pack,
Opera Widget, Pivot Style Template, Rockbox Theme package, Simple Machines
Forums theme, SubEthaEdit Mode, Trillian zipped skin, Virtual Skipper skin
APK Android package
JAR Java archive; compressed file package for classes and data
KMZ Google Earth saved working session file
KWD KWord document
ODT, ODP, OTT OpenDocument text document, presentation, and text document template, respectively.
OXPS Microsoft Open XML paper specification file
SXC, SXD, SXI, SXW OpenOffice spreadsheet (Calc), drawing (Draw), presentation (Impress), and word processing (Writer) files, respectively.
SXC StarOffice spreadsheet
WMZ Windows Media compressed skin file
XPI Mozilla Browser Archive
XPS XML paper specification file
XPT eXact Packager Models

HEX(十六进制)转换byte数组

javascript在线计算:

资料:
Mime-Detective

文件签名表格:
GCK’S FILE SIGNATURES TABLE

参考资料:
Magic number
List of file signatures

拓展阅读:
byte为什么要与上0xff?

作者

zhang

发布于

2021-12-01

更新于

2023-09-19

许可协议

CC BY-NC-SA 4.0

Your browser is out-of-date!

Update your browser to view this website correctly. Update my browser now

×