From ccf46a765ceab0c04a3f11bb37fe2241741b00f2 Mon Sep 17 00:00:00 2001
From: "H. Utku Maden"
Date: Mon, 26 Aug 2024 21:23:16 +0300
Subject: [PATCH] Improve own database.

---
Review notes (free-form text in this region, before the first diff header,
is not part of the commit message and is skipped when the patch is applied).

Reconstruction:
  * The reviewed copy of this patch had its line breaks collapsed and every
    angle-bracketed span stripped (generic type arguments, XML doc tags,
    csproj elements, author e-mail). Generic arguments and doc tags were
    restored from how the values are used in the patch itself.
  * Indentation and the position of blank context lines are best-effort;
    verify against the repository (or apply with --ignore-whitespace).
  * The ReMime.csproj hunk is reconstructed from the resource name passed to
    GetManifestResourceStream; confirm it against the real project file.
  * The "index" blob hashes are those of the original patch and no longer
    describe the post-image of MagicValue.cs, MagicValueDatabaseEntry.cs and
    database.jsonc.

Fixes relative to the original patch:
  * MagicValue.TryParse read the escape character with magic[j++], which
    yields the backslash itself and then skips the escaped character
    ('{\rtf1' parsed to "{\tf1"). It now reads magic[++j].
  * The escape table mapped \f to the letter 'f' and had no \t entry.
  * A hex pair whose second character is not a hex digit is now rejected
    instead of silently producing a garbage byte.
  * database.jsonc: the RTF magic doubles its backslash so the fixed parser
    yields the bytes "{\rtf1" rather than '{', CR, "tf1".
  * MagicValueDatabaseEntry.GetEntries throws InvalidDataException instead of
    a bare Exception when the database deserializes to null.

 .../Image/ImageMagicValues.cs                 |  58 ----------
 ReMime/ContentResolvers/MagicResolver.cs      |  31 +++--
 ReMime/ContentResolvers/MagicValue.cs         | 109 +++++++++++++++++-
 .../MagicValueDatabaseEntry.cs                |  43 +++++++
 ReMime/ContentResolvers/database.jsonc        |  62 ++++++++++
 ReMime/ReMime.csproj                          |   4 +
 6 files changed, 238 insertions(+), 69 deletions(-)
 delete mode 100644 ReMime/ContentResolvers/Image/ImageMagicValues.cs
 create mode 100644 ReMime/ContentResolvers/MagicValueDatabaseEntry.cs
 create mode 100644 ReMime/ContentResolvers/database.jsonc

diff --git a/ReMime/ContentResolvers/Image/ImageMagicValues.cs b/ReMime/ContentResolvers/Image/ImageMagicValues.cs
deleted file mode 100644
index ee2b0d6..0000000
--- a/ReMime/ContentResolvers/Image/ImageMagicValues.cs
+++ /dev/null
@@ -1,58 +0,0 @@
-using System.Collections.Generic;
-
-namespace ReMime.ContentResolvers.Image
-{
-    public static class ImageMagicValues
-    {
-        private static readonly MediaType Tiff = new MediaType("image/tiff", new string[] { "nif", "tif", "tiff"});
-        private static readonly MediaType Jpeg = new MediaType("image/jpeg", new string[] { "jpg", "jpeg" });
-
-        public static readonly IReadOnlyList<MagicValueMediaType> List = new List<MagicValueMediaType>() {
-            new MagicValueMediaType(new MagicValue("BM"), new MediaType("image/bmp")),
-            new MagicValueMediaType(new MagicValue("GIF8"), new MediaType("image/gif")),
-            new MagicValueMediaType(new MagicValue("IIN1"), Tiff),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0x4d, 0x4d, 0x00, 0x2a }), Tiff),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0x49, 0x49, 0x2a, 0x00 }), Tiff),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0x89, 0x50, 0x4e, 0x47 }), new MediaType("image/png")),
-
-            /* Yes this is how we are doing JPEG, I don't want to modify my thing to allow for magic values to be defined in terms of bits. */
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe0 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe1 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe2 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe3 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe4 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe5 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe6 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe7 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe8 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xe9 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xea }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xeb }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xec }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xed }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xee }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xef }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf0 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf1 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf2 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf3 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf4 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf5 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf6 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf7 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf8 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xf9 }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xfa }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xfb }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xfc }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xfd }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xfe }), Jpeg),
-            new MagicValueMediaType(new MagicValue(new byte[] { 0xff, 0xd8, 0xff, 0xff }), Jpeg),
-        }.AsReadOnly();
-
-        public static void AddToMagicResolver(MagicContentResolver resolver)
-        {
-            resolver.AddMagicValues(List);
-        }
-    }
-}
\ No newline at end of file
diff --git a/ReMime/ContentResolvers/MagicResolver.cs b/ReMime/ContentResolvers/MagicResolver.cs
index 770d689..31330ce 100644
--- a/ReMime/ContentResolvers/MagicResolver.cs
+++ b/ReMime/ContentResolvers/MagicResolver.cs
@@ -2,10 +2,11 @@ using System;
 using System.Collections.Generic;
 using System.Diagnostics.CodeAnalysis;
 using System.IO;
+using System.Linq;
 
 namespace ReMime.ContentResolvers
 {
-    public record MagicValueMediaType(MagicValue Magic, MediaType MediaType);
+    public record MagicValueMediaType(MediaType MediaType, MagicValue[] MagicValues, string[] Extensions);
 
     public class MagicContentResolver : IMediaContentResolver
     {
@@ -21,14 +22,25 @@ namespace ReMime.ContentResolvers
 
         public MagicContentResolver()
         {
-            Image.ImageMagicValues.AddToMagicResolver(this);
+            List<MagicValueDatabaseEntry> entries;
+
+            using (Stream str = typeof(MagicContentResolver).Assembly.GetManifestResourceStream("ReMime.ContentResolvers.database.jsonc")!)
+            {
+                entries = MagicValueDatabaseEntry.GetEntries(str);
+            }
+
+            AddMagicValues(entries.Select(x => (MagicValueMediaType)x));
         }
 
         public IReadOnlyCollection<MediaType> MediaTypes => _mediaTypes.AsReadOnly();
 
         public void AddMagicValue(MagicValueMediaType value)
         {
-            _maxBytes = Math.Max(_maxBytes, value.Magic.Value.Length);
+            if (value.MagicValues.Length != 0)
+            {
+                _maxBytes = Math.Max(_maxBytes, value.MagicValues.Select(x => x.Value.Length).Max());
+            }
+
             _mediaTypes.Add(value.MediaType);
 
             _tree.Add(value);
@@ -83,17 +95,14 @@ namespace ReMime.ContentResolvers
             {
                 get
                 {
-                    if (bytes.Length == 0)
+                    if (bytes.Length == 0 || Children == null)
                         return Node;
 
-                    if (Children == null)
-                        return null;
-
                     byte b = bytes[0];
 
                     if (!Children.TryGetValue(b, out Tree? subtree))
                     {
-                        return null;
+                        return Node;
                     }
 
                     return subtree[bytes.Slice(1)];
@@ -124,8 +133,10 @@ namespace ReMime.ContentResolvers
             public void Add(MagicValueMediaType magic)
             {
-                ReadOnlySpan<byte> bytes = magic.Magic.Value;
-                AddInternal(magic, bytes);
+                foreach (var entry in magic.MagicValues)
+                {
+                    AddInternal(magic, entry.Value);
+                }
             }
         }
     }
 }
diff --git a/ReMime/ContentResolvers/MagicValue.cs b/ReMime/ContentResolvers/MagicValue.cs
index bfe9f47..5724932 100644
--- a/ReMime/ContentResolvers/MagicValue.cs
+++ b/ReMime/ContentResolvers/MagicValue.cs
@@ -1,4 +1,6 @@
 using System;
+using System.Collections.Generic;
+using System.Diagnostics.CodeAnalysis;
 using System.Text;
 
 namespace ReMime.ContentResolvers
@@ -6,7 +8,7 @@ namespace ReMime.ContentResolvers
     /// <summary>
     /// A magic value to identify file types.
     /// </summary>
-    /// <param name="Value">The byte arary that makes up the magic value.</param>
+    /// <param name="Value">The byte array that makes up the magic value.</param>
     public record struct MagicValue(byte[] Value)
     {
         public MagicValue(int value) : this(BitConverter.GetBytes(value)) { }
@@ -46,5 +48,110 @@ namespace ReMime.ContentResolvers
 
             return hash;
         }
+
+        public static bool TryParse(ReadOnlySpan<char> magic, [NotNullWhen(true)] out MagicValue? value)
+        {
+            List<byte> bytes = new List<byte>();
+            StringBuilder builder = new StringBuilder();
+
+            value = null;
+
+            for (int i = 0; i < magic.Length; i++)
+            {
+                char chr = magic[i];
+                char chr2;
+                switch (chr)
+                {
+                    case '\'':
+                        builder.Clear();
+
+                        int j;
+                        for (j = i + 1; j < magic.Length; j++)
+                        {
+                            chr = magic[j];
+                            if (chr == '\'')
+                            {
+                                bytes.AddRange(Encoding.ASCII.GetBytes(builder.ToString()));
+                                break;
+                            }
+                            else if (chr == '\\')
+                            {
+                                if (j+1 >= magic.Length)
+                                    return false;
+
+                                chr2 = magic[++j]; // advance first: the escaped character follows the backslash.
+
+                                builder.Append(chr2 switch {
+                                    'n' => '\n',
+                                    'r' => '\r',
+                                    't' => '\t',
+                                    'a' => '\a',
+                                    'b' => '\b',
+                                    'f' => '\f',
+                                    'v' => '\v',
+                                    '?' => '?',
+                                    '\\' => '\\',
+                                    '\'' => '\'',
+                                    '\"' => '\"',
+                                    _ => '\0'
+                                });
+                            }
+                            else
+                            {
+                                builder.Append(chr);
+                            }
+                        }
+
+                        if (j == magic.Length)
+                        {
+                            // ASCII string overrun.
+                            return false;
+                        }
+
+                        i = j;
+
+                        break;
+                    case '0': case '1': case '2': case '3':
+                    case '4': case '5': case '6': case '7':
+                    case '8': case '9': case 'A': case 'B':
+                    case 'C': case 'D': case 'E': case 'F':
+                    case 'a': case 'b': case 'c': case 'd':
+                    case 'e': case 'f':
+                        // Misaligned hex string.
+                        if (i+1 >= magic.Length)
+                            return false;
+
+                        chr2 = magic[++i];
+                        if (AsciiToInt(chr2) < 0)
+                            return false; // second nibble is not a hex digit.
+                        bytes.Add((byte)(AsciiToInt(chr) << 4 | AsciiToInt(chr2)));
+                        break;
+
+                    case '\n': case '\f': case '\r': case '\t':
+                    case ' ':
+                        // generic whitespace.
+                        continue;
+                }
+            }
+
+            // No bytes to match.
+            if (bytes.Count == 0)
+                return false;
+
+            value = new MagicValue(bytes.ToArray());
+            return true;
+
+            static int AsciiToInt(char a)
+            {
+                if (a >= '0' && a <= '9')
+                    return a - '0';
+                else if (a >= 'A' && a <= 'F')
+                    return a - 'A' + 10;
+                else if (a >= 'a' && a <= 'f')
+                    return a - 'a' + 10;
+                else
+                    return -1;
+            }
+        }
     }
 }
\ No newline at end of file
diff --git a/ReMime/ContentResolvers/MagicValueDatabaseEntry.cs b/ReMime/ContentResolvers/MagicValueDatabaseEntry.cs
new file mode 100644
index 0000000..f8c1b59
--- /dev/null
+++ b/ReMime/ContentResolvers/MagicValueDatabaseEntry.cs
@@ -0,0 +1,43 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace ReMime.ContentResolvers
+{
+    [JsonSerializable(typeof(MagicValueDatabaseEntry))]
+    public class MagicValueDatabaseEntry
+    {
+        [JsonPropertyName("type")]
+        public string Type { get; set; } = string.Empty;
+
+        [JsonPropertyName("magic")]
+        public List<string> Magic { get; set; } = new List<string>();
+
+        [JsonPropertyName("extensions")]
+        public List<string> Extensions { get; set; } = new List<string>();
+
+        public static List<MagicValueDatabaseEntry> GetEntries(Stream str)
+        {
+            return JsonSerializer.Deserialize<List<MagicValueDatabaseEntry>>(str, new JsonSerializerOptions()
+            {
+                AllowTrailingCommas = true,
+                ReadCommentHandling = JsonCommentHandling.Skip
+            }) ?? throw new InvalidDataException("The magic value database deserialized to null.");
+        }
+
+        public static explicit operator MagicValueMediaType(MagicValueDatabaseEntry entry)
+        {
+            return new MagicValueMediaType(
+                new MediaType(entry.Type),
+                entry.Magic.Select(x => (MagicValue.TryParse(x, out var value), value))
+                    .Where(x => x.Item1)
+                    .Select(x => (MagicValue)x.value!)
+                    .ToArray(),
+                entry.Extensions.ToArray()
+                );
+        }
+    }
+}
\ No newline at end of file
diff --git a/ReMime/ContentResolvers/database.jsonc b/ReMime/ContentResolvers/database.jsonc
new file mode 100644
index 0000000..ec554f0
--- /dev/null
+++ b/ReMime/ContentResolvers/database.jsonc
@@ -0,0 +1,62 @@
+/**
+ * ReMime Magic Value & File Extension Database
+ * ---------------------------------------------
+ * This is a self compiled list of magic values, file extensions and
+ * their mime-types. Please contribute common file formats if you come
+ * across them.
+ *
+ * This file is only for common file formats that do not need any extra
+ * detection logic. Do not add major container formats like ZIP or RIFF
+ * into this list.
+ */
+[
+    // #region application/*
+    { "type": "application/vnd.rar", "magic": [ "'Rar!'1a07" ], "extensions": [ "rar" ] },
+    { "type": "application/postscript", "magic": [ "'%!PS'"], "extensions": [ "ps", "eps", "epsf" ] },
+    { "type": "application/pdf", "magic": ["'%PDF-'"], "extensions": [ "pdf" ] },
+    // #endregion
+
+    // #region audio/*
+    { "type": "audio/mp3", "magic": [ "fffb", "fff3", "fff2", "'ID3'" ], "extensions": [ "mp3" ] },
+    { "type": "audio/flac", "magic": [ "'fLaC'" ], "extensions": [ "flac" ] },
+    { "type": "audio/midi", "magic": [ "'MThd'" ], "extensions": [ "mid", "midi" ] },
+    // #endregion
+
+    // #region font/*
+    { "type": "font/woff", "magic": [ "'wOFF'" ], "extensions": [ "woff" ] },
+    { "type": "font/woff2", "magic": [ "'wOF2'" ], "extensions": [ "woff2" ] },
+    { "type": "font/ttf", "magic": [ "0001000000" ], "extensions": [ "ttf", "tte", "dfont" ] },
+    { "type": "font/otf", "magic": [ "'OTTO'" ], "extensions": [ "otf" ]},
+    // #endregion
+
+    // #region image/*
+    { "type": "image/bmp", "magic": [ "'BM'" ], "extensions": [ "bmp" ] },
+    { "type": "image/gif", "magic": [ "'GIF8'" ], "extensions": [ "gif" ] },
+    { "type": "image/tiff", "magic": [ "'IIN1'", "4d4d002a", "49492a00"], "extensions": [ "tiff", "tif", "nif" ] },
+    { "type": "image/png", "magic": [ "89'PNG'" ], "extensions": [ "png" ] },
+    { "type": "image/emf", "magic": [ "01000000" ], "extensions": [ "emf" ] },
+    { "type": "image/wmf", "magic": [ "d7cdc69a" ], "extensions": [ "wmf" ] },
+    { "type": "image/x-ico", "magic": [ "00000100" ], "extensions": [ "ico" ] },
+    { "type": "image/x-qoi", "magic": [ "'qoif'" ], "extensions": [ "qoi" ]},
+
+    // The JPEG standard allows any magic value from ffd8ffe0 to ffd8ffff.
+    {
+        "type": "image/jpeg",
+        "magic": [
+            "ffd8ffe0", "ffd8ffe1", "ffd8ffe2", "ffd8ffe3",
+            "ffd8ffe4", "ffd8ffe5", "ffd8ffe6", "ffd8ffe7",
+            "ffd8ffe8", "ffd8ffe9", "ffd8ffea", "ffd8ffeb",
+            "ffd8ffec", "ffd8ffed", "ffd8ffee", "ffd8ffef",
+            "ffd8fff0", "ffd8fff1", "ffd8fff2", "ffd8fff3",
+            "ffd8fff4", "ffd8fff5", "ffd8fff6", "ffd8fff7",
+            "ffd8fff8", "ffd8fff9", "ffd8fffa", "ffd8fffb",
+            "ffd8fffc", "ffd8fffd", "ffd8fffe", "ffd8ffff"
+        ],
+        "extensions": [ "jpeg", "jpg"]
+    },
+    // #endregion
+
+    // #region text/*
+    { "type": "text/rtf", "magic": [ "'{\\\\rtf1'" ], "extensions": [ "rtf" ]}
+    // #endregion
+]
\ No newline at end of file
diff --git a/ReMime/ReMime.csproj b/ReMime/ReMime.csproj
index bdafc84..8fefd8e 100644
--- a/ReMime/ReMime.csproj
+++ b/ReMime/ReMime.csproj
@@ -6,4 +6,8 @@
     <Nullable>enable</Nullable>
   </PropertyGroup>
 
+  <ItemGroup>
+    <EmbeddedResource Include="ContentResolvers/database.jsonc" />
+  </ItemGroup>
+
 </Project>