Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

...

Code Block
language: xml
title: Zeitungsausgaben-Index schema.xml
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr schema for the newspaper issue index ("zeitungsausgaben-index").
  The index stores nested documents: issue documents and page documents share
  the single field list declared below.
-->
<schema name="zeitungsausgaben-index" version="1.4">

  <similarity class="org.apache.lucene.search.similarities.ClassicSimilarity"/>

  <types>

    <!-- OCR full text. Offsets are stored together with positions and term vectors are kept for this type. -->
    <fieldType name="text_ocr" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true">
      <analyzer>
        <!-- Strip away the XML/HTML tags to arrive at a plaintext version of the OCR and do some other clean-up -->
        <charFilter class="de.digitalcollections.solrocr.formats.alto.AltoCharFilterFactory"/>
        <!-- rest of your analyzer chain -->
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

    <!-- Despite its name, "string" is an analysed solr.TextField: whitespace tokens, lower-cased,
         plus shingles of up to 4 tokens (unigrams are kept).
         NOTE(review): this type is also used for "id", "_root_" and "_nest_parent_";
         confirm that analysed key values are intended for those fields. -->
    <fieldType name="string" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer type="index">
        <!--<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt" />-->
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ShingleFilterFactory" maxShingleSize="4" outputUnigrams="true"/>
      </analyzer>
    </fieldType>

    <fieldType name="float" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0"/>

    <!-- Date type used by "ingest_date". Declared as a Trie type to match the numeric types above. -->
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>

    <!-- Date range type used by "publication_date". -->
    <fieldType name="issue_date" class="solr.DateRangeField" sortMissingLast="true" omitNorms="true"/>

    <fieldType name="nest_path" class="solr.NestPathField"/>
  </types>

  <fields>
    <!-- As this index stores nested documents, the list of fields contains the fields used in the
         parent documents as well as those used in the child documents. Many fields are therefore
         declared as not required, even though they may be semantically required for one of the
         document types. -->

    <!-- required by Solr to be able to store nested documents -->
    <field name="_root_" type="string" indexed="true" stored="false"/>

    <!-- filled automatically by Solr -->
    <field name="_nest_path_" type="nest_path" stored="true"/>
    <field name="_nest_parent_" type="string" indexed="true" stored="true"/>

    <!-- for issues: used to store the ddb id of the newspaper issue
         for pages: used to store a unique identifier for the page. -->
    <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>

    <!-- for issues: used to store a reference to the newspaper. -->
    <field name="zdb_id" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- for issues: used to store the title of the newspaper -->
    <field name="paper_title" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- for issues: the day of the week the issue was published; allows searching for e.g. all issues published on a Thursday -->
    <field name="publication_day" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- for issues: the date the issue was published -->
    <field name="publication_date" type="issue_date" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- for issues: the year the issue was published; required for facet search.
         NOTE(review): a numeric year value is assumed here; confirm against the indexing client. -->
    <field name="publication_year" type="long" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- for issues: the date the issue was ingested -->
    <field name="ingest_date" type="tdate" indexed="true" stored="true" required="false" multiValued="false"/>

    <field name="page" type="string" indexed="true" stored="true" multiValued="true"/>
    <field name="page.number" type="long" indexed="true" stored="true"/>
    <!-- page text analysed with the "text_ocr" type (ALTO char filter) -->
    <field name="page.fulltext" type="text_ocr" indexed="true" stored="true"/>

    <!--<field name="fulltext.page" type="string" indexed="true" stored="true" />-->
    <field name="provider" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="language" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="place_of_distribution" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="preview_reference" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <field name="_version_" type="long" indexed="false" stored="false" multiValued="false" docValues="true"/>
    <!--<field name="ocr_text" type="text_ocr" indexed="true" stored="true" />-->
  </fields>

  <!-- Field to use to determine and enforce document uniqueness.
       Unless this field is marked with required="false", it will be a required field.
  -->
  <uniqueKey>id</uniqueKey>

</schema>


 
 
 
 

 

Besonderheiten

Dieser Index benutzt das SOLR-OCR-Plugin des MDZ, um die Volltexte inkl. der Wortpositionen aus den gelieferten ALTO-Dateien zu indexieren.

...