// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for
// license information.
namespace Microsoft.Azure.Search.Models
{
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using Common;

    /// <summary>
    /// Defines extension methods for the IndexingParameters class.
    /// </summary>
    /// <remarks>
    /// Each method writes one or more entries into <c>IndexingParameters.Configuration</c> (creating the
    /// dictionary on first use) and returns the same instance, so calls can be chained.
    /// </remarks>
    public static class IndexingParametersExtensions
    {
        private const string ParsingModeKey = "parsingMode";

        /// <summary>
        /// Specifies that the indexer will index only the blobs with the file name extensions you specify. Each
        /// string is a file extension such as ".pdf" or ".docx"; a missing leading dot is added automatically.
        /// If you pass the same file extension to this method and ExcludeFileNameExtensions, blobs with that
        /// extension will be excluded from indexing (that is, ExcludeFileNameExtensions takes precedence).
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="extensions">
        /// File extensions to include in indexing. When null or empty, the configuration is left untouched.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <exception cref="ArgumentException">
        /// An extension is null, empty, or contains the wildcard character '*'.
        /// </exception>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters IndexFileNameExtensions(this IndexingParameters parameters, params string[] extensions) =>
            ConfigureFileNameExtensions(parameters, "indexedFileNameExtensions", extensions);

        /// <summary>
        /// Specifies that the indexer will not index blobs with the file name extensions you specify. Each
        /// string is a file extension such as ".pdf" or ".docx"; a missing leading dot is added automatically.
        /// If you pass the same file extension to this method and IndexFileNameExtensions, blobs with that
        /// extension will be excluded from indexing (that is, this method takes precedence).
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="extensions">
        /// File extensions to exclude from indexing. When null or empty, the configuration is left untouched.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <exception cref="ArgumentException">
        /// An extension is null, empty, or contains the wildcard character '*'.
        /// </exception>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ExcludeFileNameExtensions(this IndexingParameters parameters, params string[] extensions) =>
            ConfigureFileNameExtensions(parameters, "excludedFileNameExtensions", extensions);

        /// <summary>
        /// Specifies which parts of a blob will be indexed by the blob storage indexer.
        /// </summary>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" />
        /// </remarks>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="extractionMode">
        /// A <see cref="BlobExtractionMode" /> value; its string form is stored under the "dataToExtract" setting.
        /// </param>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters SetBlobExtractionMode(this IndexingParameters parameters, BlobExtractionMode extractionMode) =>
            Configure(parameters, "dataToExtract", (string)extractionMode);

        /// <summary>
        /// Tells the indexer to assume that all blobs contain JSON, which it will then parse such that each blob's
        /// JSON maps to a single document in the search index.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-json-blobs/" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseJson(this IndexingParameters parameters) =>
            Configure(parameters, ParsingModeKey, "json");

        /// <summary>
        /// Tells the indexer to assume that all blobs contain new-line separated JSON, which it will then parse
        /// such that each line will map to a single document in the search index.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-json-blobs/" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseJsonLines(this IndexingParameters parameters) =>
            Configure(parameters, ParsingModeKey, "jsonLines");

        /// <summary>
        /// Tells the indexer to assume that all blobs contain JSON arrays, which it will then parse such that each
        /// JSON array element will map to a single document in the search index.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-json-blobs" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="documentRoot">
        /// An optional JSON Pointer that tells the indexer how to find the JSON array if it's not at the top level.
        /// If this parameter is null or empty, no "documentRoot" setting is written and the indexer will assume
        /// that the JSON array can be found at the top level. Default is null.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseJsonArrays(this IndexingParameters parameters, string documentRoot = null)
        {
            Configure(parameters, ParsingModeKey, "jsonArray");

            // The document root is optional; only record it when the caller actually supplied one.
            if (!string.IsNullOrEmpty(documentRoot))
            {
                Configure(parameters, "documentRoot", documentRoot);
            }

            return parameters;
        }

        /// <summary>
        /// Tells the indexer to assume that all blobs are delimited text files. Currently only comma-separated
        /// value (CSV) text files are supported.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-index-csv-blobs" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="headers">
        /// Specifies column headers that the indexer will use to map values to specific fields in the search index.
        /// If you don't specify any headers, the indexer assumes that the first non-blank line of each blob
        /// contains comma-separated headers.
        /// </param>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseDelimitedTextFiles(this IndexingParameters parameters, params string[] headers)
        {
            Configure(parameters, ParsingModeKey, "delimitedText");

            if (headers?.Length > 0)
            {
                // Explicit headers were supplied: pass them to the indexer as one comma-separated setting.
                Configure(parameters, "delimitedTextHeaders", headers.ToCommaSeparatedString());
            }
            else
            {
                // No headers supplied: tell the indexer to read them from the first line of each blob.
                Configure(parameters, "firstLineContainsHeaders", true);
            }

            return parameters;
        }

        /// <summary>
        /// Tells the indexer to assume that blobs should be parsed as text files in UTF-8 encoding.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseText(this IndexingParameters parameters) =>
            ParseText(parameters, Encoding.UTF8);

        /// <summary>
        /// Tells the indexer to assume that blobs should be parsed as text files in the desired encoding.
        /// See <see href="https://docs.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage" /> for details.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="encoding">
        /// Encoding used to read the text stored in blobs; must not be null. Its <c>WebName</c> is stored under
        /// the "encoding" setting.
        /// </param>
        /// <returns>The IndexingParameters instance.</returns>
        public static IndexingParameters ParseText(this IndexingParameters parameters, Encoding encoding)
        {
            Throw.IfArgumentNull(encoding, nameof(encoding));

            Configure(parameters, ParsingModeKey, "text");
            Configure(parameters, "encoding", encoding.WebName);
            return parameters;
        }

        /// <summary>
        /// Specifies that <see cref="BlobExtractionMode.StorageMetadata" /> blob extraction mode will be
        /// automatically used for blobs of unsupported content types. This behavior is enabled by default.
        /// </summary>
        /// <remarks>
        /// This option only applies to indexers that index Azure Blob Storage.
        /// </remarks>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <returns>The IndexingParameters instance.</returns>
        [Obsolete("This behavior is now enabled by default, so calling this method is no longer necessary.")]
        public static IndexingParameters DoNotFailOnUnsupportedContentType(this IndexingParameters parameters) =>
            Configure(parameters, "failOnUnsupportedContentType", false);

        /// <summary>
        /// Shared implementation of IndexFileNameExtensions and ExcludeFileNameExtensions: validates and
        /// normalizes the extensions, then stores them as one comma-separated value under <paramref name="key" />.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure.</param>
        /// <param name="key">Name of the configuration setting to write.</param>
        /// <param name="extensions">Extensions supplied by the caller; may be null or empty.</param>
        /// <returns>The IndexingParameters instance.</returns>
        private static IndexingParameters ConfigureFileNameExtensions(
            IndexingParameters parameters,
            string key,
            string[] extensions)
        {
            // Check the receiver before the "nothing to do" exit so a null instance is rejected whether or
            // not any extensions were passed, matching every other method in this class.
            Throw.IfArgumentNull(parameters, nameof(parameters));

            if (extensions == null || extensions.Length == 0)
            {
                return parameters;
            }

            string joined = extensions.Select(ValidateExtension).Select(FixUpExtension).ToCommaSeparatedString();
            return Configure(parameters, key, joined);
        }

        /// <summary>
        /// Stores <paramref name="value" /> under <paramref name="key" /> in the Configuration dictionary,
        /// creating the dictionary if it does not exist yet. An existing entry with the same key is overwritten.
        /// </summary>
        /// <param name="parameters">IndexingParameters to configure; checked with Throw.IfArgumentNull.</param>
        /// <param name="key">Name of the configuration setting.</param>
        /// <param name="value">Value of the configuration setting.</param>
        /// <returns>The IndexingParameters instance.</returns>
        private static IndexingParameters Configure(IndexingParameters parameters, string key, object value)
        {
            Throw.IfArgumentNull(parameters, nameof(parameters));

            if (parameters.Configuration == null)
            {
                parameters.Configuration = new Dictionary<string, object>();
            }

            parameters.Configuration[key] = value;
            return parameters;
        }

        /// <summary>
        /// Rejects extensions that are null, empty, or contain the wildcard character '*'.
        /// </summary>
        /// <param name="extension">Extension to check.</param>
        /// <exception cref="ArgumentException">The extension is null, empty, or contains '*'.</exception>
        /// <returns>The extension, unchanged, when it is acceptable.</returns>
        private static string ValidateExtension(string extension)
        {
            if (string.IsNullOrEmpty(extension))
            {
                throw new ArgumentException("Extension cannot be null or empty string.");
            }

            if (extension.Contains("*"))
            {
                throw new ArgumentException("Extension cannot contain the wildcard character '*'.");
            }

            return extension;
        }

        /// <summary>
        /// Ensures the extension starts with a dot, prepending one when it is missing.
        /// </summary>
        /// <param name="extension">A non-null extension.</param>
        /// <returns>The extension with a leading dot.</returns>
        private static string FixUpExtension(string extension)
        {
            if (!extension.StartsWith(".", StringComparison.Ordinal))
            {
                return "." + extension;
            }

            return extension;
        }
    }
}