...
Der ZDB Index wird mit Hilfe der Newspaper Spark App aus dem ZDB Dump aufgebaut.
...
...
...
Der Zeitungsausgabenindex bedient die folgenden Use cases:
Die primäre Datenquelle für diesen Index ist die in Cassandra gespeicherte Outbox, also das Transformationsergebnis. Dieses wird von der Newspaper-Spark-App eingelesen, aufbereitet, mit Daten aus dem ZDB Index ergänzt und in den Index geschrieben.
Dieser Index arbeitet mit Nested Documents, um sowohl die Zeitungsausgaben als auch die Einzelseiten der Ausgaben zu speichern. Die Menge der im Schema definierten Felder enthält also sowohl die Felder für die Ausgaben als auch die für die Einzelseiten.
HIER SCHEMA EINFÜGEN
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Solr schema of the newspaper issue index ("zeitungsausgaben-index").

  The index stores nested documents: newspaper issues are the parent
  documents, the single pages of an issue are their child documents.
  Both document kinds share the field list declared below.
-->
<schema name="zeitungsausgaben-index" version="1.4">

  <similarity class="org.apache.lucene.search.similarities.ClassicSimilarity"/>

  <types>
    <!--
      Full text delivered as ALTO OCR files. Offsets and term vectors are
      stored next to the positions; presumably the OCR plugin needs them to
      map hits back to word coordinates (confirm against the plugin docs).
    -->
    <fieldType name="text_ocr" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true">
      <analyzer>
        <!-- Strip away the XML/HTML tags to arrive at a plaintext version of
             the OCR and do some other clean-up -->
        <charFilter class="de.digitalcollections.solrocr.formats.alto.AltoCharFilterFactory" />
        <!-- Remaining analyzer chain: split on whitespace, then lower-case -->
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
      </analyzer>
    </fieldType>

    <!--
      NOTE(review): despite its name this type is an analyzed solr.TextField
      (whitespace tokens, lower-cased, shingles of up to 4 tokens), not a
      solr.StrField. It also backs "id" (the uniqueKey) and "_root_"; the
      Solr reference guide asks for a non-analyzed uniqueKey type. Confirm
      that this is intended before relying on exact-match semantics.
    -->
    <fieldType name="string" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer type="index">
        <!--<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt" />-->
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.ShingleFilterFactory" maxShingleSize="4" outputUnigrams="true" />
      </analyzer>
    </fieldType>

    <fieldType name="float" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" />

    <!-- Point-in-time date type; referenced by "ingest_date". It was missing
         from the schema, so the reference could not be resolved. -->
    <fieldType name="tdate" class="solr.TrieDateField" docValues="true" precisionStep="6" positionIncrementGap="0"/>

    <!-- Date range type; referenced by "publication_date". -->
    <fieldType name="issue_date" class="solr.DateRangeField" sortMissingLast="true" omitNorms="true" />

    <fieldType name="nest_path" class="solr.NestPathField" />
  </types>

  <fields>
    <!--
      As this index is used to store nested documents, the list of fields
      contains the fields used in the parent documents as well as the ones
      used in the child documents. Therefore a lot of fields are declared as
      not required even though they might be required semantically for one
      of the document types.
    -->

    <!-- required by Solr to be able to store nested documents -->
    <field name="_root_" type="string" indexed="true" stored="false"/>

    <!-- filled automatically by Solr -->
    <field name="_nest_path_" type="nest_path" stored="true" />
    <field name="_nest_parent_" type="string" indexed="true" stored="true"/>

    <!-- used to store the ddb id of the newspaper issue or a unique
         identifier for the page. -->
    <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>

    <!-- for issues: used to store a reference to the newspaper. -->
    <field name="zdb_id" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="paper_title" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- for issues: the day of the week the issue was published; allows
         searching for e.g. all issues published on a Thursday -->
    <field name="publication_day" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <!-- uses the declared "issue_date" type; the previously referenced
         type "newspaper_date" is not declared in this schema -->
    <field name="publication_date" type="issue_date" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="ingest_date" type="tdate" indexed="true" stored="true" required="false" multiValued="false"/>

    <field name="page" type="string" indexed="true" stored="true" multiValued="true" />
    <field name="page.number" type="long" indexed="true" stored="true"/>
    <field name="page.fulltext" type="text_ocr" indexed="true" stored="true" />
    <!--<field name="fulltext.page" type="string" indexed="true" stored="true" />-->

    <field name="provider" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="language" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="place_of_distribution" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="preview_reference" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <!--
    <field name="stored_field" type="string" indexed="false" stored="true" multiValued="true"/>
    <field name="indexed_field" type="string" indexed="true" stored="false" multiValued="true"/>
    <field name="stored_and_indexed_field" type="string" indexed="true" stored="true" multiValued="true" />
    <field name="timestamp" type="ddbdate" indexed="true" stored="true" multiValued="true"/>
    <field name="docval" type="float" indexed="false" stored="false" multiValued="false" docValues="true"/>
    <field name="facet_docval" type="keyword" indexed="false" stored="false" multiValued="false" docValues="true"/>
    <field name="facet_field" type="keyword" indexed="false" stored="false" multiValued="true" docValues="true"/>
    -->

    <field name="_version_" type="long" indexed="false" stored="false" multiValued="false" docValues="true" />
    <!--<field name="ocr_text" type="text_ocr" indexed="true" stored="true" />-->
  </fields>

  <!-- Field to use to determine and enforce document uniqueness. Unless this
       field is marked with required="false", it will be a required field -->
  <uniqueKey>id</uniqueKey>

</schema>
cc
Besonderheiten
Dieser Index benutzt das SOLR-OCR-Plugin des MDZ, um die Volltexte inkl. der Wortpositionen aus den gelieferten ALTO-Dateien zu indexieren.
...