How to Search, Modify & Replace All Hyperlinks in a Word Document using .NET
This technical tip explains how .NET developers can find and modify all hyperlinks in a Word document. To find and modify hyperlinks it would be nice to have some sort of Hyperlink object with properties, but in the current version there is no built-in functionality in Aspose.Words to deal with hyperlink fields. Hyperlinks in Microsoft Word documents are fields. A field consists of the field code and the field result. In the current version of Aspose.Words, there is no single object that represents a field. Aspose.Words represents a field by a set of nodes: [FieldStart], one or more [Run] nodes of the field code, [FieldSeparator], one or more [Run] nodes of the field result, and [FieldEnd]. While Aspose.Words does not have a high-level abstraction to represent fields, and hyperlink fields in particular, all of the necessary low-level document elements and their properties are exposed, and with a bit of coding you can implement quite sophisticated document manipulation features. This example shows how to create a simple class that represents a hyperlink in the document.
// Code samples for finding and modifying all hyperlinks in a Word document
//[C# Code Sample]
using System;
using System.Text;
using System.Text.RegularExpressions;
using Aspose.Words;
using Aspose.Words.Fields;
namespace Examples
{
/// <summary>
/// Demonstrates how to replace the hyperlinks of a Word document.
/// </summary>
public class ExReplaceHyperlinks : ExBase
{
    // Replacement URL and display text applied to every non-local hyperlink.
    private const string NewUrl = @"http://www.aspose.com";
    private const string NewName = "Aspose - The .NET & Java Component Publisher";

    /// <summary>
    /// Opens the sample document, rewrites the URL and the display name of every
    /// hyperlink that does not point to a bookmark, and saves the result under a new name.
    /// </summary>
    public void ReplaceHyperlinks()
    {
        // Specify your document name here.
        // NOTE(review): MyDir is not declared in this class; presumably inherited from ExBase.
        Document doc = new Document(MyDir + "ReplaceHyperlinks.doc");

        // Hyperlinks are fields and every field begins with a FieldStart node,
        // so collect the field starts and pick the hyperlink ones.
        NodeList starts = doc.SelectNodes("//FieldStart");
        foreach (FieldStart start in starts)
        {
            // Not a hyperlink field: nothing to do.
            if (start.FieldType != FieldType.FieldHyperlink)
            {
                continue;
            }

            // Wrap the field in the "facade" class to work with it through properties.
            Hyperlink link = new Hyperlink(start);

            // Links to bookmarks inside the document are left untouched.
            if (link.IsLocal)
            {
                continue;
            }

            // Setting the properties rewrites the field code and the field result.
            link.Target = NewUrl;
            link.Name = NewName;
        }

        doc.Save(MyDir + "ReplaceHyperlinks Out.doc");
    }
}
/// <summary>
/// This "facade" class makes it easier to work with a hyperlink field in a Word document.
///
/// A hyperlink is represented by a HYPERLINK field in a Word document. A field in Aspose.Words
/// consists of several nodes and it might be difficult to work with all those nodes directly.
/// Note this is a simple implementation: the field start, field code and field separator must be
/// siblings under the same parent. The field code and the field result are read across all of their
/// sibling nodes, but setting <see cref="Target"/>, <see cref="IsLocal"/> or <see cref="Name"/>
/// collapses the corresponding part into a single Run.
///
/// [FieldStart][Run - field code][FieldSeparator][Run - field result][FieldEnd]
///
/// The field code contains a string in one of these formats:
/// HYPERLINK "url"
/// HYPERLINK \l "bookmark name"
///
/// The field result contains text that is displayed to the user.
/// </summary>
internal class Hyperlink
{
    /// <summary>
    /// Wraps the hyperlink field that begins at the given field start node.
    /// </summary>
    /// <param name="fieldStart">The FieldStart node of a HYPERLINK field.</param>
    /// <exception cref="ArgumentNullException"><paramref name="fieldStart"/> is null.</exception>
    /// <exception cref="ArgumentException">The field is not a hyperlink field.</exception>
    /// <exception cref="InvalidOperationException">No field separator follows the field start among its siblings.</exception>
    internal Hyperlink(FieldStart fieldStart)
    {
        if (fieldStart == null)
        {
            throw new ArgumentNullException("fieldStart");
        }

        if (fieldStart.FieldType != FieldType.FieldHyperlink)
        {
            throw new ArgumentException("Field start type must be FieldHyperlink.");
        }

        mFieldStart = fieldStart;

        // Find the field separator node.
        mFieldSeparator = FindNextSibling(mFieldStart, NodeType.FieldSeparator);
        if (mFieldSeparator == null)
        {
            throw new InvalidOperationException("Cannot find field separator.");
        }

        // Find the field end node. Normally field end will always be found, but in the example document
        // there happens to be a paragraph break included in the hyperlink and this puts the field end
        // in the next paragraph. It will be much more complicated to handle fields which span several
        // paragraphs correctly, but in this case allowing field end to be null is enough for our purposes.
        mFieldEnd = FindNextSibling(mFieldSeparator, NodeType.FieldEnd);

        // Field code looks something like [ HYPERLINK "http://www.myurl.com" ], but it can consist of several runs.
        string fieldCode = GetTextSameParent(mFieldStart.NextSibling, mFieldSeparator);
        Match match = gRegex.Match(fieldCode.Trim());

        // If the field code does not match the expected format, both groups are empty:
        // the link is then treated as non-local with an empty target.
        mIsLocal = (match.Groups[1].Length > 0); // The link is local if \l is present in the field code.
        mTarget = match.Groups[2].Value;
    }

    /// <summary>
    /// Gets or sets the display name of the hyperlink.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// On set: the node that follows the field separator is not a Run, so there is no field result to update.
    /// </exception>
    internal string Name
    {
        get
        {
            // Start after the separator so that the separator's own text is not part of the name
            // (the constructor skips the field start in the same way when it reads the field code).
            return GetTextSameParent(mFieldSeparator.NextSibling, mFieldEnd);
        }
        set
        {
            // Hyperlink display name is stored in the field result which is a Run
            // node between field separator and field end.
            Run fieldResult = mFieldSeparator.NextSibling as Run;
            if (fieldResult == null)
            {
                throw new InvalidOperationException("Cannot find field result run.");
            }

            fieldResult.Text = value;

            // But sometimes the field result can consist of more than one run, delete these runs.
            RemoveSameParent(fieldResult.NextSibling, mFieldEnd);
        }
    }

    /// <summary>
    /// Gets or sets the target url or bookmark name of the hyperlink.
    /// Setting the value rewrites the field code.
    /// </summary>
    internal string Target
    {
        get
        {
            return mTarget;
        }
        set
        {
            mTarget = value;
            UpdateFieldCode();
        }
    }

    /// <summary>
    /// True if the hyperlink's target is a bookmark inside the document. False if the hyperlink is a url.
    /// Setting the value rewrites the field code.
    /// </summary>
    internal bool IsLocal
    {
        get
        {
            return mIsLocal;
        }
        set
        {
            mIsLocal = value;
            UpdateFieldCode();
        }
    }

    /// <summary>
    /// Writes the current target and local flag into the first field code run
    /// and removes any further nodes up to the field separator.
    /// </summary>
    private void UpdateFieldCode()
    {
        // Field code is stored in a Run node between field start and field separator.
        Run fieldCode = mFieldStart.NextSibling as Run;
        if (fieldCode == null)
        {
            throw new InvalidOperationException("Cannot find field code run.");
        }

        fieldCode.Text = string.Format("HYPERLINK {0}\"{1}\"", ((mIsLocal) ? "\\l " : ""), mTarget);

        // But sometimes the field code can consist of more than one run, delete these runs.
        RemoveSameParent(fieldCode.NextSibling, mFieldSeparator);
    }

    /// <summary>
    /// Goes through siblings starting from the start node (inclusive) until it finds
    /// a node of the specified type. Returns null if there is no such sibling.
    /// </summary>
    private static Node FindNextSibling(Node startNode, NodeType nodeType)
    {
        for (Node node = startNode; node != null; node = node.NextSibling)
        {
            if (node.NodeType == nodeType)
            {
                return node;
            }
        }

        return null;
    }

    /// <summary>
    /// Retrieves text from start up to but not including the end node.
    /// A null end node means "up to the last sibling"; a null start node yields an empty string.
    /// </summary>
    private static string GetTextSameParent(Node startNode, Node endNode)
    {
        if (startNode == null)
        {
            return string.Empty;
        }

        if ((endNode != null) && (startNode.ParentNode != endNode.ParentNode))
        {
            throw new ArgumentException("Start and end nodes are expected to have the same parent.");
        }

        StringBuilder builder = new StringBuilder();

        // Stop at the end node or when the siblings run out (the end node may be null).
        for (Node child = startNode; (child != null) && (child != endNode); child = child.NextSibling)
        {
            builder.Append(child.GetText());
        }

        return builder.ToString();
    }

    /// <summary>
    /// Removes nodes from start up to but not including the end node.
    /// Start and end are assumed to have the same parent. A null end node means
    /// "up to the last sibling"; a null start node means there is nothing to remove.
    /// </summary>
    private static void RemoveSameParent(Node startNode, Node endNode)
    {
        if (startNode == null)
        {
            return;
        }

        if ((endNode != null) && (startNode.ParentNode != endNode.ParentNode))
        {
            throw new ArgumentException("Start and end nodes are expected to have the same parent.");
        }

        Node curChild = startNode;
        while ((curChild != null) && (curChild != endNode))
        {
            // Remember the next sibling before the current node is detached.
            Node nextChild = curChild.NextSibling;
            curChild.Remove();
            curChild = nextChild;
        }
    }

    private readonly Node mFieldStart;
    private readonly Node mFieldSeparator;
    private readonly Node mFieldEnd; // May be null, see the constructor.
    private bool mIsLocal;
    private string mTarget;

    /// <summary>
    /// Parses a hyperlink field code. Group 1 captures the optional \l flag,
    /// group 2 captures the quoted target (url or bookmark name).
    /// </summary>
    private static readonly Regex gRegex = new Regex(
        "\\S+" + // one or more non spaces HYPERLINK or other word in other languages
        "\\s+" + // one or more spaces
        "(?:\"\"\\s+)?" + // non capturing optional "" and one or more spaces, found in one of the customers files.
        "(\\\\l\\s+)?" + // optional \l flag followed by one or more spaces
        "\"" + // one opening double quote
        "([^\"]+)" + // one or more chars except double quote (hyperlink target)
        "\"" // one closing double quote
    );
}
}
// [Visual Basic Code Sample]
Imports Microsoft.VisualBasic
Imports System
Imports System.Text
Imports System.Text.RegularExpressions
Imports Aspose.Words
Imports Aspose.Words.Fields
Namespace Examples
''' <summary>
''' Shows how to replace hyperlinks in a Word document.
''' </summary>
<TestFixture> _
Public Class ExReplaceHyperlinks
Inherits ExBase
''' <summary>
''' Finds all hyperlinks
Url: http://www.aspose.com/.net/word-component.aspx
Language: C# | User: Sheraz Khan | Created: Apr 15, 2015 | Tags: Search Hyperlinks in MS Word Modify Hyperlinks in Word File Replace All Hyperlinks in Word Document create word documents .NET word processing .NET API for MS Word