indra/llui/llurlregistry.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310

/** 
 * @file llurlregistry.cpp
 * @author Martin Reddy
 * @brief Contains a set of Url types that can be matched in a string
 *
 * $LicenseInfo:firstyear=2009&license=viewerlgpl$
 * Second Life Viewer Source Code
 * Copyright (C) 2010, Linden Research, Inc.
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License only.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 * $/LicenseInfo$
 */

#include "linden_common.h"
#include "llurlregistry.h"
#include "lluriparser.h"

#include <boost/regex.hpp>

// default dummy callback that ignores any label updates from the server
void LLUrlRegistryNullCallback(const std::string &url, const std::string &label, const std::string& icon)
{
}

LLUrlRegistry::LLUrlRegistry()
{
	mUrlEntry.reserve(20);

	// Urls are matched in the order that they were registered
	registerUrl(new LLUrlEntryNoLink());
	mUrlEntryIcon = new LLUrlEntryIcon();
	registerUrl(mUrlEntryIcon);
	mLLUrlEntryInvalidSLURL = new LLUrlEntryInvalidSLURL();
	registerUrl(mLLUrlEntryInvalidSLURL);
	registerUrl(new LLUrlEntrySLURL());

	// decorated links for host names like: secondlife.com and lindenlab.com
	registerUrl(new LLUrlEntrySecondlifeURL());
	registerUrl(new LLUrlEntrySimpleSecondlifeURL());

	registerUrl(new LLUrlEntryHTTP());
	mUrlEntryHTTPLabel = new LLUrlEntryHTTPLabel();
	registerUrl(mUrlEntryHTTPLabel);
	registerUrl(new LLUrlEntryAgentCompleteName());
	registerUrl(new LLUrlEntryAgentDisplayName());
	registerUrl(new LLUrlEntryAgentUserName());
	// LLUrlEntryAgent*Name must appear before LLUrlEntryAgent since 
	// LLUrlEntryAgent is a less specific (catchall for agent urls)
	registerUrl(new LLUrlEntryAgent());
	registerUrl(new LLUrlEntryGroup());
	registerUrl(new LLUrlEntryParcel());
	registerUrl(new LLUrlEntryTeleport());
	registerUrl(new LLUrlEntryRegion());
	registerUrl(new LLUrlEntryWorldMap());
	registerUrl(new LLUrlEntryObjectIM());
	registerUrl(new LLUrlEntryPlace());
	registerUrl(new LLUrlEntryInventory());
	registerUrl(new LLUrlEntryObjectIM());
	//LLUrlEntrySL and LLUrlEntrySLLabel have more common pattern, 
	//so it should be registered in the end of list
	registerUrl(new LLUrlEntrySL());
	mUrlEntrySLLabel = new LLUrlEntrySLLabel();
	registerUrl(mUrlEntrySLLabel);
	// most common pattern is a URL without any protocol,
	// e.g., "secondlife.com"
	registerUrl(new LLUrlEntryHTTPNoProtocol());	
}

LLUrlRegistry::~LLUrlRegistry()
{
	// free all of the LLUrlEntryBase objects we are holding
	std::vector<LLUrlEntryBase *>::iterator it;
	for (it = mUrlEntry.begin(); it != mUrlEntry.end(); ++it)
	{
		delete *it;
	}
}

void LLUrlRegistry::registerUrl(LLUrlEntryBase *url, bool force_front)
{
	if (url)
	{
		if (force_front)  // IDEVO
			mUrlEntry.insert(mUrlEntry.begin(), url);
		else
		mUrlEntry.push_back(url);
	}
}

static bool matchRegex(const char *text, boost::regex regex, U32 &start, U32 &end)
{
	boost::cmatch result;
	bool found;

	// regex_search can potentially throw an exception, so check for it
	try
	{
		found = boost::regex_search(text, result, regex);
	}
	catch (std::runtime_error &)
	{
		return false;
	}

	if (! found)
	{
		return false;
	}

	// return the first/last character offset for the matched substring
	start = static_cast<U32>(result[0].first - text);
	end = static_cast<U32>(result[0].second - text) - 1;

	// we allow certain punctuation to terminate a Url but not match it,
	// e.g., "http://foo.com/." should just match "http://foo.com/"
	if (text[end] == '.' || text[end] == ',')
	{
		end--;
	}
	// ignore a terminating ')' when Url contains no matching '('
	// see DEV-19842 for details
	else if (text[end] == ')' && std::string(text+start, end-start).find('(') == std::string::npos)
	{
		end--;
	}

	else if (text[end] == ']' && std::string(text+start, end-start).find('[') == std::string::npos)
	{
			end--;
	}

	return true;
}

static bool stringHasUrl(const std::string &text)
{
	// fast heuristic test for a URL in a string. This is used
	// to avoid lots of costly regex calls, BUT it needs to be
	// kept in sync with the LLUrlEntry regexes we support.
	return (text.find("://") != std::string::npos ||
			text.find("www.") != std::string::npos ||
			text.find(".com") != std::string::npos ||
			text.find(".net") != std::string::npos ||
			text.find(".edu") != std::string::npos ||
			text.find(".org") != std::string::npos ||
			text.find("<nolink>") != std::string::npos ||
			text.find("<icon") != std::string::npos);
}

bool LLUrlRegistry::findUrl(const std::string &text, LLUrlMatch &match, const LLUrlLabelCallback &cb, bool is_content_trusted)
{
	// avoid costly regexes if there is clearly no URL in the text
	if (! stringHasUrl(text))
	{
		return false;
	}

	// find the first matching regex from all url entries in the registry
	U32 match_start = 0, match_end = 0;
	LLUrlEntryBase *match_entry = NULL;

	std::vector<LLUrlEntryBase *>::iterator it;
	for (it = mUrlEntry.begin(); it != mUrlEntry.end(); ++it)
	{
		//Skip for url entry icon if content is not trusted
		if(!is_content_trusted && (mUrlEntryIcon == *it))
		{
			continue;
		}

		LLUrlEntryBase *url_entry = *it;

		U32 start = 0, end = 0;
		if (matchRegex(text.c_str(), url_entry->getPattern(), start, end))
		{
			// does this match occur in the string before any other match
			if (start < match_start || match_entry == NULL)
			{

				if((mLLUrlEntryInvalidSLURL == *it))
				{
					if(url_entry && url_entry->isSLURLvalid(text.substr(start, end - start + 1)))
					{
						continue;
					}
				}

				if((mUrlEntryHTTPLabel == *it) || (mUrlEntrySLLabel == *it))
				{
					if(url_entry && !url_entry->isWikiLinkCorrect(text.substr(start, end - start + 1)))
					{
						continue;
					}
				}

				match_start = start;
				match_end = end;
				match_entry = url_entry;
			}
		}
	}
	
	// did we find a match? if so, return its details in the match object
	if (match_entry)
	{
		// fill in the LLUrlMatch object and return it
		std::string url = text.substr(match_start, match_end - match_start + 1);

		LLUriParser up(url);
		up.normalize();
		url = up.normalizedUri();

		match.setValues(match_start, match_end,
						match_entry->getUrl(url),
						match_entry->getLabel(url, cb),
						match_entry->getQuery(url),
						match_entry->getTooltip(url),
						match_entry->getIcon(url),
						match_entry->getStyle(),
						match_entry->getMenuName(),
						match_entry->getLocation(url),
						match_entry->getID(url),
						match_entry->underlineOnHoverOnly(url),
						match_entry->isTrusted());
		return true;
	}

	return false;
}

bool LLUrlRegistry::findUrl(const LLWString &text, LLUrlMatch &match, const LLUrlLabelCallback &cb)
{
	// boost::regex_search() only works on char or wchar_t
	// types, but wchar_t is only 2-bytes on Win32 (not 4).
	// So we use UTF-8 to make this work the same everywhere.
	std::string utf8_text = wstring_to_utf8str(text);
	if (findUrl(utf8_text, match, cb))
	{
		// we cannot blindly return the start/end offsets from
		// the UTF-8 string because it is a variable-length
		// character encoding, so we need to update the start
		// and end values to be correct for the wide string.
		LLWString wurl = utf8str_to_wstring(match.getUrl());
		S32 start = text.find(wurl);
		if (start == std::string::npos)
		{
			return false;
		}
		S32 end = start + wurl.size() - 1;

		match.setValues(start, end, match.getUrl(), 
						match.getLabel(),
						match.getQuery(),
						match.getTooltip(),
						match.getIcon(),
						match.getStyle(),
						match.getMenuName(),
						match.getLocation(),
						match.getID(),
						match.underlineOnHoverOnly());
		return true;
	}
	return false;
}

bool LLUrlRegistry::hasUrl(const std::string &text)
{
	LLUrlMatch match;
	return findUrl(text, match);
}

bool LLUrlRegistry::hasUrl(const LLWString &text)
{
	LLUrlMatch match;
	return findUrl(text, match);
}

bool LLUrlRegistry::isUrl(const std::string &text)
{
	LLUrlMatch match;
	if (findUrl(text, match))
	{
		return (match.getStart() == 0 && match.getEnd() >= text.size()-1);
	}
	return false;
}

bool LLUrlRegistry::isUrl(const LLWString &text)
{
	LLUrlMatch match;
	if (findUrl(text, match))
	{
		return (match.getStart() == 0 && match.getEnd() >= text.size()-1);
	}
	return false;
}