// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for
// license information.

namespace Microsoft.Azure.Search.Models
{
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using Common;

    /// <summary>
    /// Defines extension methods for the IndexingParameters class. Each method writes one or more
    /// entries into <c>IndexingParameters.Configuration</c> and returns the same instance so that
    /// calls can be chained.
    /// </summary>
    public static class IndexingParametersExtensions
    {
        private const string ParsingModeKey = "parsingMode";

        /// <summary>
        /// Specifies that the indexer will index only the blobs with the file name extensions you specify. Each string
        /// is a file extension with a leading dot. For example, ".pdf", ".docx", etc. If you pass the same file
        /// extension to this method and ExcludeFileNameExtensions, blobs with that extension will be excluded from
        /// indexing (that is, ExcludeFileNameExtensions takes precedence).
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="extensions">
        /// File extensions to include in indexing. An extension given without a leading dot has one prepended.
        /// If this is null or empty, the configuration is left untouched.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        /// <exception cref="ArgumentException">
        /// An extension is null, empty, or contains the wildcard character '*'.
        /// </exception>
        public static IndexingParameters IndexFileNameExtensions(this IndexingParameters parameters, params string[] extensions) =>
            ConfigureExtensionList(parameters, "indexedFileNameExtensions", extensions);

        /// <summary>
        /// Specifies that the indexer will not index blobs with the file name extensions you specify. Each string is a
        /// file extension with a leading dot. For example, ".pdf", ".docx", etc. If you pass the same file extension to
        /// this method and IndexFileNameExtensions, blobs with that extension will be excluded from indexing (that is,
        /// this method takes precedence).
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="extensions">
        /// File extensions to exclude from indexing. An extension given without a leading dot has one prepended.
        /// If this is null or empty, the configuration is left untouched.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        /// <exception cref="ArgumentException">
        /// An extension is null, empty, or contains the wildcard character '*'.
        /// </exception>
        public static IndexingParameters ExcludeFileNameExtensions(this IndexingParameters parameters, params string[] extensions) =>
            ConfigureExtensionList(parameters, "excludedFileNameExtensions", extensions);

        /// <summary>
        /// Specifies which parts of a blob will be indexed by the blob storage indexer.
        /// </summary>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" />
        /// </remarks>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="extractionMode">A <see cref="BlobExtractionMode"/> value specifying what to index.</param>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters SetBlobExtractionMode(this IndexingParameters parameters, BlobExtractionMode extractionMode) =>
            Configure(parameters, "dataToExtract", (string)extractionMode);

        /// <summary>
        /// Tells the indexer to assume that all blobs contain JSON, which it will then parse such that each blob's JSON
        /// will map to a single document in the search index.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-json-blobs/" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseJson(this IndexingParameters parameters) =>
            Configure(parameters, ParsingModeKey, "json");

        /// <summary>
        /// Tells the indexer to assume that all blobs contain new-line separated JSON, which it will then parse such
        /// that individual JSON entities in each blob will map to a single document in the search index.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-json-blobs/" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseJsonLines(this IndexingParameters parameters) =>
            Configure(parameters, ParsingModeKey, "jsonLines");

        /// <summary>
        /// Tells the indexer to assume that all blobs contain JSON arrays, which it will then parse such that each JSON
        /// object in the array will map to a single document in the search index.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-json-blobs" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="documentRoot">
        /// An optional JSON Pointer that tells the indexer how to find the JSON array if it's not the top-level JSON
        /// property of the blob. If this parameter is null or empty, the indexer will assume that the JSON array can
        /// be found in the top-level JSON property of the blob, and no "documentRoot" entry is written.
        /// Default is null.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseJsonArrays(this IndexingParameters parameters, string documentRoot = null)
        {
            Configure(parameters, ParsingModeKey, "jsonArray");

            if (!string.IsNullOrEmpty(documentRoot))
            {
                Configure(parameters, "documentRoot", documentRoot);
            }

            return parameters;
        }

        /// <summary>
        /// Tells the indexer to assume that all blobs are delimited text files. Currently only comma-separated value
        /// (CSV) text files are supported.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-csv-blobs" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="headers">
        /// Specifies column headers that the indexer will use to map values to specific fields in the search index. If
        /// you don't specify any headers, the indexer assumes that the first non-blank line of each blob contains
        /// comma-separated headers.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseDelimitedTextFiles(this IndexingParameters parameters, params string[] headers)
        {
            Configure(parameters, ParsingModeKey, "delimitedText");

            if (headers?.Length > 0)
            {
                // Explicit headers were supplied; pass them to the service as one comma-separated value.
                Configure(parameters, "delimitedTextHeaders", headers.ToCommaSeparatedString());
            }
            else
            {
                // No headers supplied; tell the indexer to read them from each blob's first line.
                Configure(parameters, "firstLineContainsHeaders", true);
            }

            return parameters;
        }

        /// <summary>
        /// Tells the indexer to assume that blobs should be parsed as text files in UTF-8 encoding.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage#indexing-plain-text" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseText(this IndexingParameters parameters) =>
            ParseText(parameters, Encoding.UTF8);

        /// <summary>
        /// Tells the indexer to assume that blobs should be parsed as text files in the desired encoding.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage#indexing-plain-text" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="encoding">
        /// Encoding used to read the text stored in blobs. Must not be null; its <see cref="Encoding.WebName"/>
        /// is what gets written to the configuration.
        /// </param>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseText(this IndexingParameters parameters, Encoding encoding)
        {
            Throw.IfArgumentNull(encoding, nameof(encoding));

            Configure(parameters, ParsingModeKey, "text");
            Configure(parameters, "encoding", encoding.WebName);
            return parameters;
        }

        /// <summary>
        /// Specifies that <see cref="BlobExtractionMode.StorageMetadata"/> blob extraction mode will be
        /// automatically used for blobs of unsupported content types. This behavior is enabled by default.
        /// </summary>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <returns>The IndexingParameters instance.</returns>
        [Obsolete("This behavior is now enabled by default, so calling this method is no longer necessary.")]
        public static IndexingParameters DoNotFailOnUnsupportedContentType(this IndexingParameters parameters) =>
            Configure(parameters, "failOnUnsupportedContentType", false);

        /// <summary>
        /// Shared implementation of the include/exclude extension-list options: validates and normalizes
        /// every extension, joins them with commas, and stores the result under <paramref name="key"/>.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="key">Configuration key to write.</param>
        /// <param name="extensions">Extensions supplied by the caller; null or empty means "do nothing".</param>
        /// <returns>The IndexingParameters instance.</returns>
        private static IndexingParameters ConfigureExtensionList(
            IndexingParameters parameters,
            string key,
            string[] extensions)
        {
            if (extensions?.Length > 0)
            {
                // Build the value before touching the configuration, so an invalid extension throws
                // without leaving any change behind.
                var joinedExtensions =
                    extensions.Select(ValidateExtension).Select(FixUpExtension).ToCommaSeparatedString();

                Configure(parameters, key, joinedExtensions);
            }

            return parameters;
        }

        /// <summary>
        /// Stores <paramref name="value"/> under <paramref name="key"/> in the Configuration dictionary of
        /// <paramref name="parameters"/>, creating the dictionary first if it is null. An existing entry
        /// with the same key is overwritten.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure; passed through the Throw.IfArgumentNull guard.</param>
        /// <param name="key">Configuration key to write.</param>
        /// <param name="value">Configuration value to store.</param>
        /// <returns>The IndexingParameters instance.</returns>
        private static IndexingParameters Configure(IndexingParameters parameters, string key, object value)
        {
            Throw.IfArgumentNull(parameters, nameof(parameters));

            if (parameters.Configuration == null)
            {
                parameters.Configuration = new Dictionary<string, object>();
            }

            parameters.Configuration[key] = value;
            return parameters;
        }

        /// <summary>
        /// Rejects extensions that are null, empty, or contain the wildcard character '*'.
        /// </summary>
        /// <param name="extension">Extension to check.</param>
        /// <returns>The same <paramref name="extension"/> when it is valid.</returns>
        /// <exception cref="ArgumentException">The extension is null, empty, or contains '*'.</exception>
        private static string ValidateExtension(string extension)
        {
            if (string.IsNullOrEmpty(extension))
            {
                throw new ArgumentException("Extension cannot be null or empty string.");
            }

            if (extension.Contains("*"))
            {
                throw new ArgumentException("Extension cannot contain the wildcard character '*'.");
            }

            return extension;
        }

        /// <summary>
        /// Ensures the extension starts with a dot, prepending one when it is missing.
        /// </summary>
        /// <param name="extension">A non-null extension (already validated by the caller).</param>
        /// <returns>The extension with a leading ".".</returns>
        private static string FixUpExtension(string extension)
        {
            if (!extension.StartsWith(".", StringComparison.Ordinal))
            {
                return "." + extension;
            }

            return extension;
        }
    }
}