MicrosoftLanguageStemmingTokenizer.java

  1. // Copyright (c) Microsoft Corporation. All rights reserved.
  2. // Licensed under the MIT License.

  3. package com.azure.search.documents.indexes.models;

  4. import com.azure.core.annotation.Fluent;
  5. import com.fasterxml.jackson.annotation.JsonProperty;
  6. import com.fasterxml.jackson.annotation.JsonTypeInfo;
  7. import com.fasterxml.jackson.annotation.JsonTypeName;

  8. /**
  9.  * Divides text using language-specific rules and reduces words to their base
  10.  * forms.
  11.  */
  12. @JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "@odata.type")
  13. @JsonTypeName("#Microsoft.Azure.Search.MicrosoftLanguageStemmingTokenizer")
  14. @Fluent
  15. public final class MicrosoftLanguageStemmingTokenizer extends LexicalTokenizer {
  16.     /*
  17.      * The maximum token length. Tokens longer than the maximum length are
  18.      * split. Maximum token length that can be used is 300 characters. Tokens
  19.      * longer than 300 characters are first split into tokens of length 300 and
  20.      * then each of those tokens is split based on the max token length set.
  21.      * Default is 255.
  22.      */
  23.     @JsonProperty(value = "maxTokenLength")
  24.     private Integer maxTokenLength;

  25.     /*
  26.      * A value indicating how the tokenizer is used. Set to true if used as the
  27.      * search tokenizer, set to false if used as the indexing tokenizer.
  28.      * Default is false.
  29.      */
  30.     @JsonProperty(value = "isSearchTokenizer")
  31.     private Boolean isSearchTokenizerUsed;

  32.     /*
  33.      * The language to use. The default is English. Possible values include:
  34.      * 'Arabic', 'Bangla', 'Bulgarian', 'Catalan', 'Croatian', 'Czech',
  35.      * 'Danish', 'Dutch', 'English', 'Estonian', 'Finnish', 'French', 'German',
  36.      * 'Greek', 'Gujarati', 'Hebrew', 'Hindi', 'Hungarian', 'Icelandic',
  37.      * 'Indonesian', 'Italian', 'Kannada', 'Latvian', 'Lithuanian', 'Malay',
  38.      * 'Malayalam', 'Marathi', 'NorwegianBokmaal', 'Polish', 'Portuguese',
  39.      * 'PortugueseBrazilian', 'Punjabi', 'Romanian', 'Russian',
  40.      * 'SerbianCyrillic', 'SerbianLatin', 'Slovak', 'Slovenian', 'Spanish',
  41.      * 'Swedish', 'Tamil', 'Telugu', 'Turkish', 'Ukrainian', 'Urdu'
  42.      */
  43.     @JsonProperty(value = "language")
  44.     private MicrosoftStemmingTokenizerLanguage language;

  45.     /**
  46.      * Constructor of {@link MicrosoftLanguageStemmingTokenizer}.
  47.      *
  48.      * @param name The name of the tokenizer. It must only contain letters, digits, spaces,
  49.      * dashes or underscores, can only start and end with alphanumeric
  50.      * characters, and is limited to 128 characters.
  51.      */
  52.     public MicrosoftLanguageStemmingTokenizer(String name) {
  53.         super(name);
  54.     }

  55.     /**
  56.      * Get the maxTokenLength property: The maximum token length. Tokens longer
  57.      * than the maximum length are split. Maximum token length that can be used
  58.      * is 300 characters. Tokens longer than 300 characters are first split
  59.      * into tokens of length 300 and then each of those tokens is split based
  60.      * on the max token length set. Default is 255.
  61.      *
  62.      * @return the maxTokenLength value.
  63.      */
  64.     public Integer getMaxTokenLength() {
  65.         return this.maxTokenLength;
  66.     }

  67.     /**
  68.      * Set the maxTokenLength property: The maximum token length. Tokens longer
  69.      * than the maximum length are split. Maximum token length that can be used
  70.      * is 300 characters. Tokens longer than 300 characters are first split
  71.      * into tokens of length 300 and then each of those tokens is split based
  72.      * on the max token length set. Default is 255.
  73.      *
  74.      * @param maxTokenLength the maxTokenLength value to set.
  75.      * @return the MicrosoftLanguageStemmingTokenizer object itself.
  76.      */
  77.     public MicrosoftLanguageStemmingTokenizer setMaxTokenLength(Integer maxTokenLength) {
  78.         this.maxTokenLength = maxTokenLength;
  79.         return this;
  80.     }

  81.     /**
  82.      * Get the isSearchTokenizer property: A value indicating how the tokenizer
  83.      * is used. Set to true if used as the search tokenizer, set to false if
  84.      * used as the indexing tokenizer. Default is false.
  85.      *
  86.      * @return the isSearchTokenizer value.
  87.      */
  88.     public Boolean isSearchTokenizer() {
  89.         return this.isSearchTokenizerUsed;
  90.     }

  91.     /**
  92.      * Set the isSearchTokenizer property: A value indicating how the tokenizer
  93.      * is used. Set to true if used as the search tokenizer, set to false if
  94.      * used as the indexing tokenizer. Default is false.
  95.      *
  96.      * @param isSearchTokenizerUsed the isSearchTokenizer value to set.
  97.      * @return the MicrosoftLanguageStemmingTokenizer object itself.
  98.      */
  99.     public MicrosoftLanguageStemmingTokenizer setIsSearchTokenizerUsed(Boolean isSearchTokenizerUsed) {
  100.         this.isSearchTokenizerUsed = isSearchTokenizerUsed;
  101.         return this;
  102.     }

  103.     /**
  104.      * Get the language property: The language to use. The default is English.
  105.      * Possible values include: 'Arabic', 'Bangla', 'Bulgarian', 'Catalan',
  106.      * 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'Estonian',
  107.      * 'Finnish', 'French', 'German', 'Greek', 'Gujarati', 'Hebrew', 'Hindi',
  108.      * 'Hungarian', 'Icelandic', 'Indonesian', 'Italian', 'Kannada', 'Latvian',
  109.      * 'Lithuanian', 'Malay', 'Malayalam', 'Marathi', 'NorwegianBokmaal',
  110.      * 'Polish', 'Portuguese', 'PortugueseBrazilian', 'Punjabi', 'Romanian',
  111.      * 'Russian', 'SerbianCyrillic', 'SerbianLatin', 'Slovak', 'Slovenian',
  112.      * 'Spanish', 'Swedish', 'Tamil', 'Telugu', 'Turkish', 'Ukrainian', 'Urdu'.
  113.      *
  114.      * @return the language value.
  115.      */
  116.     public MicrosoftStemmingTokenizerLanguage getLanguage() {
  117.         return this.language;
  118.     }

  119.     /**
  120.      * Set the language property: The language to use. The default is English.
  121.      * Possible values include: 'Arabic', 'Bangla', 'Bulgarian', 'Catalan',
  122.      * 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'Estonian',
  123.      * 'Finnish', 'French', 'German', 'Greek', 'Gujarati', 'Hebrew', 'Hindi',
  124.      * 'Hungarian', 'Icelandic', 'Indonesian', 'Italian', 'Kannada', 'Latvian',
  125.      * 'Lithuanian', 'Malay', 'Malayalam', 'Marathi', 'NorwegianBokmaal',
  126.      * 'Polish', 'Portuguese', 'PortugueseBrazilian', 'Punjabi', 'Romanian',
  127.      * 'Russian', 'SerbianCyrillic', 'SerbianLatin', 'Slovak', 'Slovenian',
  128.      * 'Spanish', 'Swedish', 'Tamil', 'Telugu', 'Turkish', 'Ukrainian', 'Urdu'.
  129.      *
  130.      * @param language the language value to set.
  131.      * @return the MicrosoftLanguageStemmingTokenizer object itself.
  132.      */
  133.     public MicrosoftLanguageStemmingTokenizer setLanguage(MicrosoftStemmingTokenizerLanguage language) {
  134.         this.language = language;
  135.         return this;
  136.     }
  137. }