...
Code Block | ||||
---|---|---|---|---|
| ||||
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Solr schema of the "zeitungsausgaben-index": newspaper issues and their pages, kept as nested documents, with the OCR full text of each page in "page.fulltext". -->
<schema name="zeitungsausgaben-index" version="1.4">
  <similarity class="org.apache.lucene.search.similarities.ClassicSimilarity"/>
  <types>
    <!-- OCR full text: offsets are stored with the positions and term vectors are enabled -->
    <fieldType name="text_ocr" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true">
      <analyzer>
        <!-- Strip away the XML/HTML tags to arrive at a plaintext version of the OCR and do some other clean-up -->
        <charFilter class="de.digitalcollections.solrocr.formats.alto.AltoCharFilterFactory" />
        <!-- rest of your analyzer chain -->
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
      </analyzer>
    </fieldType>
    <!-- NOTE(review): despite its name this type is an analyzed TextField (whitespace tokens, lower-cased, shingles of up to 4 tokens at index time), and it backs the uniqueKey "id" and "_root_"; confirm that analyzed identifiers are intended. -->
    <fieldType name="string" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer type="index">
        <!--<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt" />-->
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.ShingleFilterFactory" maxShingleSize="4" outputUnigrams="true" />
      </analyzer>
    </fieldType>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" />
    <!-- "ingest_date" references a type named "tdate" that was never declared; it is declared here as a Trie date type to match the other Trie types of this schema. NOTE(review): confirm the intended class and precisionStep. -->
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0" />
    <fieldType name="issue_date" class="solr.DateRangeField" sortMissingLast="true" omitNorms="true" />
    <fieldType name="nest_path" class="solr.NestPathField" />
  </types>
  <fields>
    <!-- As this index is used to store nested documents, the list of fields contains the fields used in the parent documents as well as the child documents. Therefore it also declares a lot of fields as not required even though they might be required semantically for one of the document types. -->
    <!-- required by Solr to be able to store nested documents -->
    <field name="_root_" type="string" indexed="true" stored="false"/>
    <!-- filled automatically by Solr -->
    <field name="_nest_path_" type="nest_path" stored="true" />
    <field name="_nest_parent_" type="string" indexed="true" stored="true"/>
    <!-- for issues: used to store the ddb id of the newspaper issue
         for pages: used to store a unique identifier for the page. -->
    <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
    <!-- for issues: used to store a reference to the newspaper. -->
    <field name="zdb_id" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: used to store the title of the newspaper -->
    <field name="paper_title" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the day of the week the issue was published - allows searching for e.g. all issues published on a Thursday -->
    <field name="publication_day" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the date the issue was published (uses the "issue_date" type; the previously referenced "newspaper_date" type is not declared in this schema) -->
    <field name="publication_date" type="issue_date" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the year the issue was published - required for facet search -->
    <!-- NOTE(review): this declaration used to be a second "publication_date"; the name "publication_year" and the type "long" are reconstructed from the comment above - confirm against the indexing client. -->
    <field name="publication_year" type="long" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the date the issue was ingested -->
    <field name="ingest_date" type="tdate" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="page" type="string" indexed="true" stored="true" multiValued="true" />
    <field name="page.number" type="long" indexed="true" stored="true"/>
    <field name="page.fulltext" type="text_ocr" indexed="true" stored="true" />
    <!--<field name="fulltext.page" type="string" indexed="true" stored="true" />-->
    <field name="provider" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="language" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="place_of_distribution" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="preview_reference" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="_version_" type="long" indexed="false" stored="false" multiValued="false" docValues="true" />
    <!--<field name="ocr_text" type="text_ocr" indexed="true" stored="true" />-->
  </fields>
  <!-- Field to use to determine and enforce document uniqueness. Unless this field is marked with required="false", it will be a required field -->
  <uniqueKey>id</uniqueKey>
</schema> |
Besonderheiten
Dieser Index benutzt das SOLR-OCR-Plugin des MDZ, um die Volltexte inkl. der Wortpositionen aus den gelieferten ALTO-Dateien zu indexieren.
...