Wednesday, March 21, 2012

Pattern extraction table value function

I need a table value function for extracting strings matching a particular pattern from a long string.

e.g. I have a table called cs_Posts, it has a column called FormattedBody, this value can be something like:

Code Snippet

this is the 1st photo <a href="http://jvcwebdev:81/cs/blogs/hllee/DSC00884.JPGhttp://jvcwebdev:81/cs/blogs/hllee/DSC00884.JPG">http://jvcwebdev:81/cs/blogs/hllee/DSC00884.JPG</A< A>>" border="0" alt="" /></a>


this is the 2nd photo<a href="http://jvcwebdev:81/cs/blogs/hllee/scene%20photos/DSC00859.JPGhttp://jvcwebdev:81/cs/blogs/hllee/scene%20photos/DSC00859.JPG">http://jvcwebdev:81/cs/blogs/hllee/scene%20photos/DSC00859.JPG</A< A>>" border="0" alt="" /></a>

When this long string is input, the table value function should 2 rows:
http://jvcwebdev:81/cs/blogs/hllee/DSC00884.JPG
http://jvcwebdev:81/cs/blogs/hllee/scene%20photos/DSC00859.JPG

Any idea?

This is just a sample,

Code Snippet

Create table Utility_Numbers (Number int);

Declare @.I as Int;

Set @.I = 1

While @.I<=8000

Begin

Insert Into Utility_Numbers Values(@.I);

Set @.I = @.I + 1;

End

Go

Code Snippet

Create Function GetUrls

(

@.html as varchar(8000)

) returns @.result Table(URL varchar(1000))

as

Begin

Set @.html = char(10) + @.html + char(10)

Declare @.Table Table (data varchar(1000))

Insert Into @.Table

Select Substring(@.html,number,charindex(char(10),@.html,number+1) - number) from Utility_Numbers where number<len(@.html)

and substring(@.html,number,1)=char(10)

Insert Into @.result

Select Substring(data,charindex('>http://',data)+1,charindex('</'< SPAN>,data,charindex('>http://',data)+1)-charindex('>http://',data)-1) from @.Table

return;

End

Go

Code Snippet

Select * from GetUrls('

this is the 1st photo http://jvcwebdev:81/cs/blogs/hllee/DSC00884.JPG</A< A>>" border="0" alt="" />

this is the 2nd photohttp://jvcwebdev:81/cs/blogs/hllee/scene%20photos/DSC00859.JPG</A< A>>" border="0" alt="" />

')

|||

Thanks Manivannan. But I got my answer now

Code Snippet

CREATE FUNCTION ExtractHTMLImgs
(
@.html nvarchar(max)
)
RETURNS
@.result TABLE
(
imgLink nvarchar(4000),
ordinal int
)
WITH SCHEMABINDING
AS
BEGIN
-- image link prefix & suffix
DECLARE @.imgPrefix char(10)
SET @.imgPrefix = '<img src="'
DECLARE @.imgSuffix char(2)
SET @.imgSuffix = '" '

-- image link ordinal
DECLARE @.ordinal int
SET @.ordinal = 1

-- searching positions
DECLARE @.imgStartPos int
SET @.imgStartPos = 1
DECLARE @.imgEndPos int
SET @.imgEndPos = 1

-- extract image links
WHILE @.imgEndPos < LEN(@.html)
BEGIN
-- search the image-link-prefix, starting from the last found image-end
SET @.imgStartPos = CHARINDEX(@.imgPrefix, @.html, @.imgEndPos)

IF @.imgStartPos = 0
BEGIN
-- NO more image link => STOP
BREAK
END
ELSE
BEGIN
-- image found => get the image-link-start-position
SET @.imgStartPos = @.imgStartPos + LEN(@.imgPrefix)

-- search the image-link-suffix, starting from the current image-start
SET @.imgEndPos = CHARINDEX(@.imgSuffix, @.html, @.imgStartPos)

-- populate results
INSERT @.result VALUES (
SUBSTRING(@.html, @.imgStartPos, (@.imgEndPos - @.imgStartPos)),
@.ordinal
)

-- next
SET @.ordinal = @.ordinal + 1
END
END

RETURN
END

No comments:

Post a Comment