Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.

In this article you will find:

...

  1. Open the backend of your Sitefinity instance

  2. Navigate to Administration → Settings and click Advanced (your-site-domain/Sitefinity/Administration/Settings/Advanced)

  3. Open the Hawksearch configuration

  4. Under document size limit enter 4000KB

  5. Save the changes

...

Note

During indexing the files are stripped and only the text content is extracted. Some files contain a lot of metadata or embedded resources (e.g. photos) so a 10MB .pdf may only contain 1MB of actual data.

Info

Setup search service

In order to index documents above the document size limit, you need to inherit from the HawksearchService class and override the AdaptDocuments method. Here we demonstrate two approaches: emptying the Content field, or keeping only its first 500 words.

Info

The following code snippet demonstrates how to strip the document of its Content field so that it passes the document size limit check.

...

Code Block
breakoutModewide
languagec#
using System;
using System.Collections.Generic;
using System.Linq;
using Hawksearch.Search;
using Telerik.Sitefinity.Services.Search.Data;
using Telerik.Sitefinity.Configuration;
using Telerik.Sitefinity.Services.Search.Model;
using Hawksearch.Configuration;
using Hawksearch.SDK.Indexing;

namespace SitefinityWebApp
{
    /// <summary>
    /// Hawksearch service customization that lets oversized library documents pass the
    /// document size limit check by clearing their "Content" field before the batch is
    /// handed to the base <c>AdaptDocuments</c> implementation.
    /// </summary>
    public class CustomSearchService : HawksearchService
    {
        private const string DocumentContentType = "Telerik.Sitefinity.Libraries.Model.Document";
        private const string ContentTypeFieldName = "ContentType";
        private const string ContentFieldName = "Content";

        /// <summary>
        /// Adapts the incoming documents for submission. When the first document of the batch
        /// is a Sitefinity library document, every document whose calculated size exceeds
        /// the configured <c>DocumentSizeLimit</c> is replaced by a copy with an empty
        /// "Content" field.
        /// </summary>
        /// <param name="documents">The documents about to be indexed.</param>
        /// <returns>The result of the base implementation for the (possibly modified) batch.</returns>
        protected override List<SubmitDocument> AdaptDocuments(IEnumerable<IDocument> documents)
        {
            // Materialize exactly once: the sequence may be lazy, and enumerating it
            // repeatedly is wasteful and may yield different instances each time.
            var documentList = documents.ToList();

            if (!this.IsDocumentBatch(documentList))
            {
                return base.AdaptDocuments(documentList);
            }

            var hawkConfig = ConfigManager.GetManager().GetSection<HawkSearchConfig>();

            for (var i = 0; i < documentList.Count; i++)
            {
                if (this.CalculateDocumentSize(documentList[i]) > hawkConfig.DocumentSizeLimit)
                {
                    documentList[i] = this.ModifyDocument(documentList[i]);
                }
            }

            return base.AdaptDocuments(documentList);
        }

        /// <summary>
        /// Determines whether the batch should be size-checked, based on the "ContentType"
        /// field of its first document (only the first document is inspected).
        /// </summary>
        /// <param name="documents">The materialized batch.</param>
        /// <returns><c>true</c> when the first document's content type is the library document type.</returns>
        private bool IsDocumentBatch(List<IDocument> documents)
        {
            var firstDocument = documents.FirstOrDefault();

            if (firstDocument == null)
            {
                return false;
            }

            var contentTypeField = firstDocument.Fields.FirstOrDefault(f => f.Name == ContentTypeFieldName);

            // A missing field or a null value means the type is unknown; leave the batch untouched.
            if (contentTypeField == null || contentTypeField.Value == null)
            {
                return false;
            }

            // The content type is an identifier, not linguistic text, so compare ordinally.
            return string.Equals(
                contentTypeField.Value.ToString(),
                DocumentContentType,
                StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Calculates the size of a document as the UTF-16 byte count of the string form
        /// of all its non-null field values.
        /// </summary>
        /// <param name="document">The document to measure.</param>
        /// <returns>The size in kilobytes (bytes / 1024).</returns>
        private double CalculateDocumentSize(IDocument document)
        {
            long totalBytes = 0;

            foreach (var field in document.Fields)
            {
                if (field.Value != null)
                {
                    totalBytes += System.Text.Encoding.Unicode.GetByteCount(field.Value.ToString());
                }
            }

            return totalBytes / 1024.0;
        }

        /// <summary>
        /// Builds a new document with the same fields and identity field name, with the
        /// value of the "Content" field (if present) set to an empty string.
        /// </summary>
        /// <param name="document">The oversized document.</param>
        /// <returns>A new document whose "Content" field is empty.</returns>
        private IDocument ModifyDocument(IDocument document)
        {
            var fields = new List<IField>(document.Fields);
            var contentField = fields.FirstOrDefault(f => f.Name == ContentFieldName);

            if (contentField != null)
            {
                // NOTE(review): the field list is a shallow copy, so this presumably also
                // clears the Content field on the source document - confirm that is acceptable.
                contentField.Value = string.Empty;
            }

            return new Document(fields, document.IdentityField.Name);
        }
    }
}

Take the first 500 words

Code Block
breakoutModewide
languagec#
using System;
using System.Collections.Generic;
using System.Linq;
using Hawksearch.Search;
using Telerik.Sitefinity.Services.Search.Data;
using Telerik.Sitefinity.Configuration;
using Telerik.Sitefinity.Services.Search.Model;
using Hawksearch.Configuration;
using Hawksearch.SDK.Indexing;
using Field = Telerik.Sitefinity.Services.Search.Publishing.Field;

namespace SitefinityWebApp
{
    /// <summary>
    /// Hawksearch service customization that lets oversized library documents pass the
    /// document size limit check by truncating their "Content" field to its first
    /// <see cref="WordLimit"/> words before the batch is handed to the base
    /// <c>AdaptDocuments</c> implementation.
    /// </summary>
    public class CustomSearchService : HawksearchService
    {
        private const string DocumentContentType = "Telerik.Sitefinity.Libraries.Model.Document";
        private const string ContentTypeFieldName = "ContentType";
        private const string ContentFieldName = "Content";

        /// <summary>Maximum number of words kept in the "Content" field of an oversized document.</summary>
        private const int WordLimit = 500;

        /// <summary>
        /// Adapts the incoming documents for submission. When the first document of the batch
        /// is a Sitefinity library document, every document whose calculated size exceeds
        /// the configured <c>DocumentSizeLimit</c> is replaced by a copy with truncated content.
        /// </summary>
        /// <param name="documents">The documents about to be indexed.</param>
        /// <returns>The result of the base implementation for the (possibly modified) batch.</returns>
        protected override List<SubmitDocument> AdaptDocuments(IEnumerable<IDocument> documents)
        {
            // Materialize exactly once: the sequence may be lazy, and enumerating it
            // repeatedly is wasteful and may yield different instances each time.
            var documentList = documents.ToList();

            if (!this.IsDocumentBatch(documentList))
            {
                return base.AdaptDocuments(documentList);
            }

            var hawkConfig = ConfigManager.GetManager().GetSection<HawkSearchConfig>();

            for (var i = 0; i < documentList.Count; i++)
            {
                if (this.CalculateDocumentSize(documentList[i]) > hawkConfig.DocumentSizeLimit)
                {
                    documentList[i] = this.ModifyDocument(documentList[i]);
                }
            }

            return base.AdaptDocuments(documentList);
        }

        /// <summary>
        /// Determines whether the batch should be size-checked, based on the "ContentType"
        /// field of its first document (only the first document is inspected).
        /// </summary>
        /// <param name="documents">The materialized batch.</param>
        /// <returns><c>true</c> when the first document's content type is the library document type.</returns>
        private bool IsDocumentBatch(List<IDocument> documents)
        {
            var firstDocument = documents.FirstOrDefault();

            if (firstDocument == null)
            {
                return false;
            }

            var contentTypeField = firstDocument.Fields.FirstOrDefault(f => f.Name == ContentTypeFieldName);

            // A missing field or a null value means the type is unknown; leave the batch untouched.
            if (contentTypeField == null || contentTypeField.Value == null)
            {
                return false;
            }

            // The content type is an identifier, not linguistic text, so compare ordinally.
            return string.Equals(
                contentTypeField.Value.ToString(),
                DocumentContentType,
                StringComparison.OrdinalIgnoreCase);
        }

        /// <summary>
        /// Calculates the size of a document as the UTF-16 byte count of the string form
        /// of all its non-null field values.
        /// </summary>
        /// <param name="document">The document to measure.</param>
        /// <returns>The size in kilobytes (bytes / 1024).</returns>
        private double CalculateDocumentSize(IDocument document)
        {
            long totalBytes = 0;

            foreach (var field in document.Fields)
            {
                if (field.Value != null)
                {
                    totalBytes += System.Text.Encoding.Unicode.GetByteCount(field.Value.ToString());
                }
            }

            return totalBytes / 1024.0;
        }

        /// <summary>
        /// Builds a new document with the same fields and identity field name, in which the
        /// "Content" field (if present) is replaced by a truncated version.
        /// </summary>
        /// <param name="document">The oversized document.</param>
        /// <returns>A new document; identical field set when there is no "Content" field.</returns>
        private IDocument ModifyDocument(IDocument document)
        {
            var fields = new List<IField>(document.Fields);
            var contentIndex = fields.FindIndex(f => f.Name == ContentFieldName);

            // Documents without a Content field have nothing to truncate; replacing in
            // place also keeps the field at its original position in the list.
            if (contentIndex >= 0)
            {
                fields[contentIndex] = this.ExtractFieldContent(fields[contentIndex], WordLimit);
            }

            return new Document(fields, document.IdentityField.Name);
        }

        /// <summary>
        /// Produces a "Content" field holding at most <paramref name="wordLimit"/> words of
        /// the given field's value, joined by single spaces.
        /// </summary>
        /// <param name="contentField">The original content field; may be null or have a null value.</param>
        /// <param name="wordLimit">The maximum number of words to keep.</param>
        /// <returns>
        /// A new field with the truncated text, or <paramref name="contentField"/> itself when
        /// it is null or its value is null, empty or whitespace.
        /// </returns>
        private IField ExtractFieldContent(IField contentField, int wordLimit)
        {
            if (contentField == null || contentField.Value == null)
            {
                return contentField;
            }

            var fieldValue = contentField.Value.ToString();

            if (string.IsNullOrWhiteSpace(fieldValue))
            {
                return contentField;
            }

            // A null separator array splits on any whitespace (spaces, tabs, line breaks);
            // empty entries are dropped so runs of whitespace do not count as words.
            var words = fieldValue.Split((char[])null, StringSplitOptions.RemoveEmptyEntries);

            return new Field
            {
                Name = ContentFieldName,
                Value = string.Join(" ", words.Take(wordLimit))
            };
        }
    }
}

...