Export Sitefinity's Page Content in Hawksearch V4L

In this article you will find:

Goal

This article provides information on how to export Sitefinity’s page content in Hawksearch V4L.

Prerequisite

Installed Connector -

Building a Custom Exporter

You can extend the default implementation of the DataLoader class using the IoCKernel in Global.asax on the Bootstrapper_Bootstrapped method.

 

using Hawksearch.SDK; using Hawksearch.SDK.Export; using SitefinityWebApp.Hawksearch; using System; using Telerik.Sitefinity.Abstractions; namespace SitefinityWebApp { public class Global : System.Web.HttpApplication { protected void Application_Start(object sender, EventArgs e) { Bootstrapper.Bootstrapped += this.Bootstrapper_Bootstrapped; } private void Bootstrapper_Bootstrapped(object sender, EventArgs e) { IocKernel.Kernel.Rebind<IDataLoader>().To<CustomDataLoader>(); } } }

Export Page Content

By default, the content of the page is not indexed. To export the content of the page, you need to extend the default DataLoader class and add the logic below, which is responsible for exporting the page content and striping the html from it.

  1. Add the logic for exporting page content to your custom exporter class.

using System.Collections.Generic; using System.Globalization; using Hawksearch.Export; using Telerik.Sitefinity.Model; using Telerik.Sitefinity.Pages.Model; using Telerik.Sitefinity.Web.ResourceCombining; namespace SitefinityWebApp.Search { public class CustomDataLoader : DataLoader { protected override IEnumerable<string> GetFieldValue(IDataItem item, string fieldName, CultureInfo culture = null) { if (item.GetType() == typeof(PageNode) && fieldName == "Page") { var pnItem = (PageNode)item; InMemoryPageRender renderer = new InMemoryPageRender(); var memPage = renderer.RenderPage(pnItem, false, true); var result = new List<string>(); var cleanHtml = this.StripHTML(memPage).Replace("\n", string.Empty).Trim(); cleanHtml = cleanHtml.Replace("\"", "\"\""); cleanHtml = "\"" + cleanHtml + "\""; result.Add(cleanHtml); return result; } if (item.GetType() == typeof(PageNode) && fieldName == "Description") { var pnItem = (PageNode)item; var pageData = pnItem.GetPageData(); var result = new List<string>(); if (pageData != null) { var final = this.StripHTML(pageData.Description.ToString()).Replace("\n", string.Empty).Trim(); final = final.Replace("\"", "\"\""); if (!string.IsNullOrWhiteSpace(final)) { result.Add(final); } } else { result.Add(string.Empty); } return result; } return base.GetFieldValue(item, fieldName, culture); } private string StripHTML(string source) { try { string result; // Remove HTML Development formatting // Replace line breaks with space // because browsers inserts space result = source.Replace("\r", " "); // Replace line breaks with space // because browsers inserts space result = result.Replace("\n", " "); // Remove step-formatting result = result.Replace("\t", string.Empty); // Remove repeating spaces because browsers ignore them result = System.Text.RegularExpressions.Regex.Replace(result, @"( )+", " "); // Remove the header (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*head([^>])*>", "<head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*head( )*>)", "</head>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<head>).*(</head>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all scripts (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*script([^>])*>", "<script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*script( )*>)", "</script>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); //result = System.Text.RegularExpressions.Regex.Replace(result, // @"(<script>)([^(<script>\.</script>)])*(</script>)", // string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<script>).*(</script>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // remove all styles (prepare first by clearing attributes) result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*style([^>])*>", "<style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"(<( )*(/)( )*style( )*>)", "</style>", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(<style>).*(</style>)", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert tabs in spaces of <td> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*td([^>])*>", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line breaks in places of <BR> and <LI> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*br( )*>", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*li( )*>", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // insert line paragraphs (double line breaks) in place // if <P>, <DIV> and <TR> tags result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*div([^>])*>", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*tr([^>])*>", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"<( )*p([^>])*>", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove remaining tags like <a>, links, images, // comments etc - anything that's enclosed inside < > result = System.Text.RegularExpressions.Regex.Replace(result, @"<[^>]*>", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // replace special characters: result = System.Text.RegularExpressions.Regex.Replace(result, @" ", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&bull;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lsaquo;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&rsaquo;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&trade;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&frasl;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&lt;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&gt;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&copy;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, @"&reg;", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove all others. More can be added, see // http://hotwired.lycos.com/webmonkey/reference/special_characters/ result = System.Text.RegularExpressions.Regex.Replace(result, @"&(.{2,6});", string.Empty, System.Text.RegularExpressions.RegexOptions.IgnoreCase); // for testing //System.Text.RegularExpressions.Regex.Replace(result, // this.txtRegex.Text,string.Empty, // System.Text.RegularExpressions.RegexOptions.IgnoreCase); // make line breaking consistent result = result.Replace("\n", " "); // Remove extra line breaks and tabs: // replace over 2 breaks with 2 and over 4 tabs with 4. // Prepare first to remove any whitespaces in between // the escaped characters and remove redundant tabs in between line breaks result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\r)", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\t)", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\t)( )+(\r)", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)( )+(\t)", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove redundant tabs result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+(\r)", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Remove multiple tabs following a line break with just one tab result = System.Text.RegularExpressions.Regex.Replace(result, "(\r)(\t)+", " ", System.Text.RegularExpressions.RegexOptions.IgnoreCase); // Initial replacement target string for line breaks string breaks = "\r\r"; // Initial replacement target string for tabs string tabs = "\t\t\t\t\t"; for (int index = 0; index < result.Length; index++) { result = result.Replace(breaks, " "); result = result.Replace(tabs, " "); } // That's it. return result; } catch { return source; } } } }

Â