/
Export Sitefinity's Page Content in Hawksearch V4L
Export Sitefinity's Page Content in Hawksearch V4L
In this article you will find:
Goal
This article provides information on how to export Sitefinity’s page content in Hawksearch V4L.
Prerequisite
Installed Connector - Installing the Connector
Building a Custom Exporter
You can extend the default implementation of the DataLoader class using the IoCKernel in Global.asax on the Bootstrapper_Bootstrapped method.
using Hawksearch.SDK;
using Hawksearch.SDK.Export;
using SitefinityWebApp.Hawksearch;
using System;
using Telerik.Sitefinity.Abstractions;
namespace SitefinityWebApp
{
public class Global : System.Web.HttpApplication
{
protected void Application_Start(object sender, EventArgs e)
{
Bootstrapper.Bootstrapped += this.Bootstrapper_Bootstrapped;
}
private void Bootstrapper_Bootstrapped(object sender, EventArgs e)
{
IocKernel.Kernel.Rebind<IDataLoader>().To<CustomDataLoader>();
}
}
}
Export Page Content
By default, the content of the page is not indexed. To export the content of the page, you need to extend the default DataLoader class and add the logic below, which is responsible for exporting the page content and striping the html from it.
Add the logic for exporting page content to your custom exporter class.
using System.Collections.Generic;
using System.Globalization;
using Hawksearch.Export;
using Telerik.Sitefinity.Model;
using Telerik.Sitefinity.Pages.Model;
using Telerik.Sitefinity.Web.ResourceCombining;
namespace SitefinityWebApp.Search
{
public class CustomDataLoader : DataLoader
{
protected override IEnumerable<string> GetFieldValue(IDataItem item, string fieldName, CultureInfo culture = null)
{
if (item.GetType() == typeof(PageNode) && fieldName == "Page")
{
var pnItem = (PageNode)item;
InMemoryPageRender renderer = new InMemoryPageRender();
var memPage = renderer.RenderPage(pnItem, false, true);
var result = new List<string>();
var cleanHtml = this.StripHTML(memPage).Replace("\n", string.Empty).Trim();
cleanHtml = cleanHtml.Replace("\"", "\"\"");
cleanHtml = "\"" + cleanHtml + "\"";
result.Add(cleanHtml);
return result;
}
if (item.GetType() == typeof(PageNode) && fieldName == "Description")
{
var pnItem = (PageNode)item;
var pageData = pnItem.GetPageData();
var result = new List<string>();
if (pageData != null)
{
var final = this.StripHTML(pageData.Description.ToString()).Replace("\n", string.Empty).Trim();
final = final.Replace("\"", "\"\"");
if (!string.IsNullOrWhiteSpace(final))
{
result.Add(final);
}
}
else
{
result.Add(string.Empty);
}
return result;
}
return base.GetFieldValue(item, fieldName, culture);
}
private string StripHTML(string source)
{
try
{
string result;
// Remove HTML Development formatting
// Replace line breaks with space
// because browsers inserts space
result = source.Replace("\r", " ");
// Replace line breaks with space
// because browsers inserts space
result = result.Replace("\n", " ");
// Remove step-formatting
result = result.Replace("\t", string.Empty);
// Remove repeating spaces because browsers ignore them
result = System.Text.RegularExpressions.Regex.Replace(result,
@"( )+", " ");
// Remove the header (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*head([^>])*>", "<head>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*head( )*>)", "</head>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<head>).*(</head>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all scripts (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*script([^>])*>", "<script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*script( )*>)", "</script>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
//result = System.Text.RegularExpressions.Regex.Replace(result,
// @"(<script>)([^(<script>\.</script>)])*(</script>)",
// string.Empty,
// System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<script>).*(</script>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// remove all styles (prepare first by clearing attributes)
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*style([^>])*>", "<style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"(<( )*(/)( )*style( )*>)", "</style>",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(<style>).*(</style>)", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert tabs in spaces of <td> tags
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*td([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line breaks in places of <BR> and <LI> tags
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*br( )*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*li( )*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// insert line paragraphs (double line breaks) in place
// if <P>, <DIV> and <TR> tags
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*div([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*tr([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<( )*p([^>])*>", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove remaining tags like <a>, links, images,
// comments etc - anything that's enclosed inside < >
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<[^>]*>", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// replace special characters:
result = System.Text.RegularExpressions.Regex.Replace(result,
@" ", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"•", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"‹", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"›", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"™", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"⁄", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"<", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@">", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"©", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
@"®", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove all others. More can be added, see
// http://hotwired.lycos.com/webmonkey/reference/special_characters/
result = System.Text.RegularExpressions.Regex.Replace(result,
@"&(.{2,6});", string.Empty,
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// for testing
//System.Text.RegularExpressions.Regex.Replace(result,
// this.txtRegex.Text,string.Empty,
// System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// make line breaking consistent
result = result.Replace("\n", " ");
// Remove extra line breaks and tabs:
// replace over 2 breaks with 2 and over 4 tabs with 4.
// Prepare first to remove any whitespaces in between
// the escaped characters and remove redundant tabs in between line breaks
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)( )+(\r)", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\t)( )+(\t)", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\t)( )+(\r)", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)( )+(\t)", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove redundant tabs
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)(\t)+(\r)", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Remove multiple tabs following a line break with just one tab
result = System.Text.RegularExpressions.Regex.Replace(result,
"(\r)(\t)+", " ",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
// Initial replacement target string for line breaks
string breaks = "\r\r";
// Initial replacement target string for tabs
string tabs = "\t\t\t\t\t";
for (int index = 0; index < result.Length; index++)
{
result = result.Replace(breaks, " ");
result = result.Replace(tabs, " ");
}
// That's it.
return result;
}
catch
{
return source;
}
}
}
}