...
Code Block | ||||
---|---|---|---|---|
| ||||
<?xml version="1.0" encoding="UTF-8" ?>
<!-- Solr schema for the newspaper issue index ("zeitungsausgaben-index").
     The index stores nested documents: newspaper issues are the parent documents,
     their pages are the child documents. Both document kinds share this field list. -->
<schema name="zeitungsausgaben-index" version="1.4">
  <similarity class="org.apache.lucene.search.similarities.ClassicSimilarity"/>

  <types>
    <!-- OCR full text. Offsets are stored with positions and term vectors are enabled;
         NOTE(review): presumably required by the OCR highlighting plugin - confirm. -->
    <fieldType name="text_ocr" class="solr.TextField" storeOffsetsWithPositions="true" termVectors="true">
      <analyzer>
        <!-- Strip away the XML/HTML tags to arrive at a plaintext version of the OCR and do some other clean-up -->
        <charFilter class="de.digitalcollections.solrocr.formats.alto.AltoCharFilterFactory" />
        <!-- rest of your analyzer chain -->
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
      </analyzer>
    </fieldType>

    <!-- Despite its name this is an analysed text type: values are split on whitespace,
         lower-cased and expanded into shingles of up to 4 tokens (unigrams are kept).
         NOTE(review): this type also backs "id" (the uniqueKey) and "_root_"; an analysed
         uniqueKey is unusual - verify this is intended. -->
    <fieldType name="string" class="solr.TextField" sortMissingLast="true" omitNorms="true">
      <analyzer type="index">
        <!--<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt" />-->
        <tokenizer class="solr.WhitespaceTokenizerFactory" />
        <filter class="solr.LowerCaseFilterFactory" />
        <filter class="solr.ShingleFilterFactory" maxShingleSize="4" outputUnigrams="true" />
      </analyzer>
    </fieldType>

    <fieldType name="float" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" docValues="true" precisionStep="0" positionIncrementGap="0" />
    <!-- Point-in-time date type used by "ingest_date". The field referenced "tdate" without
         the type being declared; declared here in line with the other Trie-based types. -->
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0" />
    <!-- Date type used by "publication_date". -->
    <fieldType name="issue_date" class="solr.DateRangeField" sortMissingLast="true" omitNorms="true" />
    <fieldType name="nest_path" class="solr.NestPathField" />
  </types>

  <fields>
    <!-- As this index is used to store nested documents, the list of fields contains the fields used in the
         parent documents as well as those used in the child documents. Therefore many fields are declared as
         not required even though they might be required semantically for one of the document types. -->

    <!-- required by Solr to be able to store nested documents -->
    <field name="_root_" type="string" indexed="true" stored="false"/>
    <!-- filled automatically by Solr -->
    <field name="_nest_path_" type="nest_path" stored="true" />
    <field name="_nest_parent_" type="string" indexed="true" stored="true"/>

    <!-- for issues: used to store the ddb id of the newspaper issue
         for pages: used to store a unique identifier for the page. -->
    <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false"/>
    <!-- for issues: used to store a reference to the newspaper. -->
    <field name="zdb_id" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: used to store the title of the newspaper -->
    <field name="paper_title" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the day of the week the issue was published - allows searching for e.g. all issues published on a Thursday -->
    <field name="publication_day" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the date the issue was published
         (uses the declared "issue_date" type; "newspaper_date" was never declared) -->
    <field name="publication_date" type="issue_date" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the year the issue was published - required for facet search.
         This was a second declaration of "publication_date", which Solr rejects as a duplicate field.
         NOTE(review): typed as "long" (docValues enabled) for faceting - confirm against the indexer. -->
    <field name="publication_year" type="long" indexed="true" stored="true" required="false" multiValued="false"/>
    <!-- for issues: the date the issue was ingested -->
    <field name="ingest_date" type="tdate" indexed="true" stored="true" required="false" multiValued="false"/>

    <field name="page" type="string" indexed="true" stored="true" multiValued="true" />
    <field name="page.number" type="long" indexed="true" stored="true"/>
    <field name="page.fulltext" type="text_ocr" indexed="true" stored="true" />
    <!--<field name="fulltext.page" type="string" indexed="true" stored="true" />-->

    <field name="provider" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="language" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="place_of_distribution" type="string" indexed="true" stored="true" required="false" multiValued="false"/>
    <field name="preview_reference" type="string" indexed="true" stored="true" required="false" multiValued="false"/>

    <field name="_version_" type="long" indexed="false" stored="false" multiValued="false" docValues="true" />
    <!--<field name="ocr_text" type="text_ocr" indexed="true" stored="true" />-->
  </fields>

  <!-- Field to use to determine and enforce document uniqueness.
       Unless this field is marked with required="false", it will be a required field
  -->
  <uniqueKey>id</uniqueKey>
</schema>
|
cc
Besonderheiten
Dieser Index benutzt das SOLR-OCR-Plugin des MDZ, um die Volltexte inkl. der Wortpositionen aus den gelieferten ALTO-Dateien zu indexieren.
Queries zum Bedienen der Anforderungen
- Volltextsuche innerhalb einer Ausgabe: /select?hl.ocr.fl=page.fulltext&hl=true&fl=id,page.number&q=+{!child%20of=%27id:4175160268881036%27}%20+page.fulltext:%22Fernsprecher%22