This script monitors the redundancy of replicated mailbox databases by validating that there are at least two configured, healthy, and current copies, and alerts you when only a single healthy copy of a replicated database exists.
Used in the following blog post:
http://msexchangeteam.com/archive/2010/05/20/454976.aspx
Script
# .SYNOPSIS
# Checks the redundancy of databases by validating that they have at least N
# configured and "healthy" copies. Active and passive copies are both counted.
# .DESCRIPTION
#
# Copyright (c) 2010 Microsoft Corporation. All rights reserved.
#
# THIS CODE IS MADE AVAILABLE AS IS, WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK
# OF THE USE OR THE RESULTS FROM THE USE OF THIS CODE REMAINS WITH THE USER.
# To use this script you need to provide either $MailboxDatabaseName or $MailboxServerName.
# To generate events for Monitoring, you need to provide -MonitoringContext switch.
# Script parameters. When no parameter set can be inferred from the arguments,
# the "Server" set is used, which checks the databases of $MailboxServerName.
[CmdletBinding(DefaultParametersetName="Server")]
param(
# Name of a single database to check (selects the "Database" parameter set).
[Parameter(ParameterSetName="Database",Mandatory=$true,Position=0)]
[string] $MailboxDatabaseName,
# By default, check against the local server
[Parameter(ParameterSetName="Server",Position=0)]
[string] $MailboxServerName = $env:COMPUTERNAME,
# Skip checking the "default" mailbox databases. eg: Mailbox Database 0017891750
# Specify $null (or an empty string) if you don't want to skip any databases.
[Parameter(ParameterSetName="Server")]
[string] $SkipDatabasesRegex = "^Mailbox Database \d{10}$",
# When set, Red/Green reporting goes through the suppression logic and (unless
# -SkipEventLogging is also given) is written as events instead of output objects.
[Parameter(ParameterSetName="Monitoring",Mandatory=$true)]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[switch] $MonitoringContext = $false,
# NOTE(review): presumably the pause between check passes; the loop that uses it
# is not in this part of the file - confirm.
[Parameter(ParameterSetName="Monitoring")]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[UInt32] $SleepDurationBetweenIterationsSecs = 60,
[Parameter(ParameterSetName="Monitoring")]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[Int32] $TerminateAfterDurationSecs = 3480, # 58 minutes; -1,0 are "Infinite"
# In the monitoring context, a Green event is only raised once the database has
# been Green for longer than this many seconds.
[Parameter(ParameterSetName="Monitoring")]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[UInt32] $SuppressGreenEventForSecs = 600, # 10 minutes
# If the total duration of being "red" exceeds this amount, raise the Red event
[Parameter(ParameterSetName="Monitoring")]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[UInt32] $ReportRedEventAfterDurationSecs = 1200, # 20 minutes
# Once we raise a red event, report it periodically every $ReportRedEventIntervalSecs seconds.
[Parameter(ParameterSetName="Monitoring")]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[UInt32] $ReportRedEventIntervalSecs = 900, # 15 minutes
# In the monitoring context, skip the event log write and emit the redundancy
# object to the output pipeline instead.
[Parameter(ParameterSetName="Monitoring")]
[Parameter(ParameterSetName="Database")]
[Parameter(ParameterSetName="Server")]
[switch] $SkipEventLogging = $false,
# Minimum number of healthy copies a database needs in order to be "Green".
[UInt32] $AtLeastNCopies = 2,
# If false, detailed summary status is left out of the events/objects reported
[switch] $ShowDetailedErrors = $false,
# The email FROM address to use for the summary report
[string] $SummaryMailFrom = $null,
# Send a summary report email to the following addresses
[string[]] $SendSummaryMailTos = $null,
# Useful to "dot-source" this script as a library - call the script as such:
# PS D:\Exchange Mailbox\v14\Scripts> . .\CheckDatabaseRedundancy.ps1 -DotSourceMode
[Parameter(ParameterSetName="DotSourceMode",Mandatory=$true)]
[switch] $DotSourceMode = $false
)
Set-StrictMode -Version 2.0

# Makes sure the Exchange 2010 management snap-in is present in this session.
# The snap-in is only added when Get-PSSnapin does not already report it.
function LoadExchangeSnapin
{
    $snapinName = 'Microsoft.Exchange.Management.PowerShell.E2010'
    $loadedSnapin = Get-PSSnapin -Name $snapinName -ErrorAction SilentlyContinue
    if (-not $loadedSnapin)
    {
        Add-PSSnapin -Name $snapinName
    }
}

LoadExchangeSnapin
#---------------------------------------
# Aliases for commonly used enum types #
#---------------------------------------
# Holding the type in a variable lets the rest of the script write
# $CopyStatusType::Healthy instead of the fully qualified enum name.
$CopyStatusType = [Microsoft.Exchange.Management.SystemConfigurationTasks.CopyStatus]
$ReplicationTypeType = [Microsoft.Exchange.Data.Directory.SystemConfiguration.ReplicationType]
$MountDialType = [Microsoft.Exchange.Data.Directory.SystemConfiguration.AutoDatabaseMountDial]
#------------
# Constants #
#------------
# This is the maximum copy queue length considered "healthy" for a passive copy.
# Currently, this value is 12 as defined by BestAvailability.
$CopyQueueLengthThreshold = [int]$MountDialType::BestAvailability
# Inspector queue above the warning threshold yields a Warning for a passive copy;
# above the failed threshold the copy is treated as Failed.
$InspectorQueueLengthWarningThreshold = $CopyQueueLengthThreshold
$InspectorQueueLengthFailedThreshold = 1000
# Replay queue above this value yields a Warning for a passive copy.
$ReplayQueueLengthWarningThreshold = 500
#-------------------
# Script variables #
#-------------------
# Stopwatches used to log how long the individual operations take.
[System.Diagnostics.Stopwatch] $script:copyStatusStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:copyStatusAllStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:clusterNodeStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:clusterNodeOverallStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:oneIterationStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
$script:databaseToStatusTable = @{} # Hashtable indexed by DatabaseName, value of Collection<DatabaseCopyStatusEntry>
$script:databasesToCheckTable = @{} # Hashtable for the databases that we want to check, indexed by DatabaseName
$script:databaseStateTable = @{} # Hashtable indexed by DatabaseName, that holds the redundancy state of each DB
$script:clusterNodeStateTable = @{} #Hashtable indexed by server name, that holds the cluster node state (Up, Down, Joining, Paused, Unknown)
$script:outputObjects = @() # List of objects to send to the output pipeline
# Set when running against a mailbox server; left $null otherwise (the DAG name is
# then taken from the database being checked).
[Microsoft.Exchange.Data.Directory.Management.MailboxServer] $script:mailboxServer = $null
[UInt64] $script:iteration = 0
# Raw output of the last cluster.exe invocation.
$script:clusterOutput = $null
[string]$script:dagName = $null
[System.Text.StringBuilder]$script:report = $null
$script:IsDataCenterLibraryPresent = $false
# Tells whether a mailbox database is replicated, i.e. whether its
# ReplicationType equals 'Remote'. Returns $true or $false.
function Is-DatabaseReplicated ([Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase] $mdb)
{
    $isRemote = ($mdb.ReplicationType -eq $ReplicationTypeType::Remote)
    return [bool]$isRemote
}
# Looks up a server in $script:clusterNodeStateTable and reports whether its
# recorded cluster node state is "Up". Servers missing from the table count as
# not online. Returns $true or $false.
function Is-DagServerOnline ([string] $serverName)
{
    Log-Verbose "Is-DagServerOnline: Entering: `$serverName=$serverName"

    $nodeStates = $script:clusterNodeStateTable

    # This is locale-dependent.
    [bool]$isOnline = $nodeStates.Contains($serverName) -and ($nodeStates[$serverName] -ieq "Up")

    Log-Verbose "Is-DagServerOnline: Leaving (returning '$isOnline')"
    return $isOnline
}
# Decides whether a passive copy's status alone allows it to be healthy.
# Only these states qualify: Healthy, DisconnectedAndHealthy, SeedingSource.
# Returns $true for those states and $false for every other one.
function Is-PassiveCopyPossiblyHealthy ([Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $status)
{
    $copyState = $status.Status

    if ($copyState -eq $CopyStatusType::Healthy)
    {
        return $true
    }
    if ($copyState -eq $CopyStatusType::DisconnectedAndHealthy)
    {
        return $true
    }
    if ($copyState -eq $CopyStatusType::SeedingSource)
    {
        return $true
    }

    return $false
}
# Reports whether the replay service of a database's ACTIVE copy looks down.
# Uses the copy statuses cached in $script:databaseToStatusTable. Returns $true
# when no active copy is found at all, or when the active copy's status is
# ServiceDown; $false otherwise.
function Is-ActiveReplayServiceDown ([string] $databaseName)
{
    Log-Verbose "Is-ActiveReplayServiceDown: '$databaseName': Entering..."

    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = $script:databaseToStatusTable[$databaseName]

    # Taking only the first match keeps $activeStatus a single entry, never a collection
    $activeStatus = $statuses | Where-Object { $_.ActiveCopy } | Select-Object -First 1

    if (-not $activeStatus)
    {
        Log-Verbose "Is-ActiveReplayServiceDown: '$databaseName': No ActiveCopy found! Assuming it has replay service down."
        return $true
    }

    $replayDown = ($activeStatus.Status -eq $CopyStatusType::ServiceDown)
    if ($replayDown)
    {
        Log-Verbose "Is-ActiveReplayServiceDown: Active copy '$($activeStatus.Name)' has replay service down."
        return $true
    }

    Log-Verbose "Is-ActiveReplayServiceDown: '$databaseName': Leaving, returning 'False'"
    return $false
}
# Rebuilds $script:databasesToCheckTable so that it contains exactly the given
# databases, indexed by database name.
function Populate-DatabasesTable ([Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $databases)
{
    $table = $script:databasesToCheckTable

    # Drop whatever an earlier pass left behind before refilling
    $table.Clear()

    foreach ($mdb in $databases)
    {
        $table[$mdb.Name] = $mdb
    }
}
# Queries the cluster remotely for the node status by running
# "cluster.exe /cluster:<name> node" and parsing its output.
#
# Parameters:
#   $clusterName - value handed to cluster.exe's /cluster: switch (callers pass the
#                  DAG/cluster netname or the name of a DAG member server).
# Returns:
#   $true if cluster.exe exited with 0, produced output, and at least one node row
#   could be parsed from it; $false otherwise.
# Side effects:
#   Always clears $script:clusterNodeStateTable; on success refills it with
#   server name -> node state as printed by cluster.exe. Also overwrites
#   $script:clusterOutput and restarts $script:clusterNodeStopwatch.
function Get-ClusterNodeStatus ([string] $clusterName)
{
    $script:clusterNodeStateTable.Clear()
    $script:clusterNodeStopwatch.Reset();
    $script:clusterNodeStopwatch.Start();
    [bool]$success = $false
    $script:clusterOutput = $null

    # Run the cluster command, instead of relying on "get-DAG -Status" since that makes
    # a replay RPC and the replay service could be down.
    # NOTE: This command can take a while (30-45 secs) when the quorum is lost, or netname is down.
    Log-Verbose "Get-ClusterNodeStatus: Running command: cluster.exe /cluster:$clusterName node"
    $clusCommand =
    {
        # The '2>&1' redirects errors to the success stream. See 'help about_redirection' for more info.
        $script:clusterOutput = ( ( cluster.exe /cluster:$clusterName node ) 2>&1 )
    }
    $dummy = TryExecute-ScriptBlock -runCommand $clusCommand -silentOnErrors $true

    if ( $LastExitCode -eq 1722 )
    {
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe failed to contact the cluster. RPC_S_SERVER_UNAVAILABLE. The cluster netname or node may be down, or quorum lost."
    }
    elseif ( $LastExitCode -ne 0 )
    {
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe did not succeed. Return value $LastExitCode. `nOutput: $($script:clusterOutput)"
    }
    elseif ( !$script:clusterOutput )
    {
        # $LastExitCode only changes when a native command actually ran. If cluster.exe
        # could not be started, a stale 0 from an earlier run may still be visible here,
        # so an exit code of 0 with no output at all is not trusted.
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe reported exit code 0 but produced no output. Treating this as a failure."
    }
    else
    {
        $success = $true
    }

    if ($success)
    {
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe returned the following output:`n $($script:clusterOutput)";
        # Sample output:
        #
        # Listing status for all available nodes:
        #
        # Node Node ID Status
        # -------------- ------- ---------------------
        # EXCH-I-782 1 Up
        # EXCH-D-770 2 Up
        # EXCH-D-772 3 Down
        $match = select-string -pattern "(?<server>\S+)\s+\d+\s+(?<state>\S+)" -inputobject ($script:clusterOutput) -allmatches
        if ( $match -ne $null )
        {
            # One regex match per node row: record the node's state under its server name
            $match.Matches | `
                foreach { $script:clusterNodeStateTable.Add( $_.Groups["server"].Value.Trim(), $_.Groups["state"].Value.Trim() ) }
        }
        else
        {
            $success = $false
            Log-Verbose "Get-ClusterNodeStatus: cluster.exe output returned no regex matches!"
        }
    }

    $script:clusterNodeStopwatch.Stop()
    Log-Verbose "Get-ClusterNodeStatus: cluster.exe operation completed in $($script:clusterNodeStopwatch.Elapsed.TotalMilliseconds) ms. Returning '$success'."
    return $success
}
# Queries the cluster node states for all cluster servers via the cluster group name (which is the DAG name).
# It then parses the output into a hashtable indexed by server name, with the cluster node state for each (i.e. Up, Down, etc.)
#
# Takes no parameters and returns nothing; results are left in
# $script:clusterNodeStateTable and $script:dagName.
# If the query by DAG name fails, each DAG member server is queried in turn; if
# every query fails, all DAG members are recorded as "Down".
function Populate-ClusterNodeStatus
{
$script:dagName = $null
$script:clusterNodeStateTable.Clear()
# Find the DAG name first.
if ($script:mailboxServer)
{
$script:dagName = $script:mailboxServer.DatabaseAvailabilityGroup.Name
}
else
{
# running in DB mode, which means there should only be one DB in this table
$db = $script:databasesToCheckTable.Values | select -First 1
$script:dagName = $db.MasterServerOrAvailabilityGroup.Name
}
Log-Verbose "Populate-ClusterNodeStatus: Found DAG '$($script:dagName)'."
$script:clusterNodeOverallStopwatch.Reset();
$script:clusterNodeOverallStopwatch.Start();
# First, try with the cluster netname (which is the DAG name)
[bool]$success = Get-ClusterNodeStatus $script:dagName
# $dag stays $null unless the fallback path below has to look the DAG up
[Microsoft.Exchange.Data.Directory.SystemConfiguration.DatabaseAvailabilityGroup]$dag = $null
if (!$success)
{
Log-Verbose "Populate-ClusterNodeStatus: Failed to query the cluster using the cluster netname of '$($script:dagName)'! Querying DAG member servers instead."
$dag = Get-DatabaseAvailabilityGroup $script:dagName
if ($dag)
{
if (!$dag.Servers -or `
($dag.Servers.Count -eq 0))
{
Log-Verbose "Populate-ClusterNodeStatus: DAG '$($script:dagName)' contains no servers!"
}
else
{
# Pick a random server to start querying
[int]$index = [int]([System.Environment]::TickCount % $dag.Servers.Count)
Log-Verbose "Populate-ClusterNodeStatus: Enumerating DAG servers starting at index='$index', server='$($dag.Servers[$index])'."
# $REVIEW: If the below syntax is used, 'break' doesn't exit out of the 'foreach'.
# Instead, in this case, break exits out of the entire script! Scary... I'll stick to for loops for now.
# 1..$dag.Servers.Count | foreach `
# Visit every member at most once, wrapping around the end of the list,
# and stop at the first server that answers.
for ($iteration = 1; $iteration -le $dag.Servers.Count; $iteration++)
{
$serverName = $dag.Servers[$index]
$success = Get-ClusterNodeStatus $serverName
if ($success)
{
break
}
else
{
$index = ($index + 1) % $dag.Servers.Count
}
}
}
}
else
{
Log-Verbose "Populate-ClusterNodeStatus: Could not find DAG '$($script:dagName)'! Marking all nodes as 'Down'."
}
}
if (!$success)
{
# Mark all the DAG members as being down.
# In case $dag is null, Is-DagServerOnline will return $false since the table should be empty
if ($dag)
{
$dag.Servers | foreach { $script:clusterNodeStateTable.Add( $_.Name, "Down" ) }
}
}
$script:clusterNodeOverallStopwatch.Stop()
Log-Verbose "Populate-ClusterNodeStatus: Overall operation completed in $($script:clusterNodeOverallStopwatch.Elapsed.TotalMilliseconds) ms."
}
# Runs one redundancy pass over the given databases.
#
# Parameters:
#   $databases        - the mailbox databases to check.
#   $ParameterSetName - the parameter set the script was invoked with; it is handed
#                       to Get-CopyStatusFromAllServers.
# Output:
#   Whatever Check-DatabaseRedundancy emits for each checked database.
# Side effects:
#   Refreshes $script:databasesToCheckTable, $script:databaseToStatusTable and the
#   cluster node state table.
function Check-Databases ([Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $databases, [string] $ParameterSetName)
{
    Populate-DatabasesTable $databases

    # find the servers to check copy statuses on.
    # @() keeps $servers an array even when nothing is returned, so that .Length
    # below stays valid under Set-StrictMode.
    [String[]]$servers = @( Get-ServersForDatabases $databases )
    if ($servers.Length -lt 2)
    {
        # Normally we should not get here, since we're only checking replicated DBs, which means
        # we should have at least 2 distinct servers. However, this can happen if copies are
        # removed while this script is running.
        Log-Warning "Get-ServersForDatabases returned only '$($servers.Length)' servers."
    }

    # get the status results and index them by database name
    if ($null -ne $script:databaseToStatusTable)
    {
        $script:databaseToStatusTable.Clear()
    }
    $statusTable = Get-CopyStatusFromAllServers $servers $ParameterSetName | `
        Group-Object -AsHashTable -Property DatabaseName
    if ($null -eq $statusTable)
    {
        # Group-Object -AsHashTable emits nothing when it receives no input. Fall back
        # to an empty table so this pass (and the Clear() of the next one) does not
        # fail on a null reference.
        $statusTable = @{}
    }
    $script:databaseToStatusTable = $statusTable

    # look up the cluster node status for the DAG
    Populate-ClusterNodeStatus

    Log-Verbose "Check-Databases: Filtering out databases we are not going to check..."
    # Filter out the databases we are not going to check, and then perform the redundancy check.
    $script:databaseToStatusTable.Keys | `
        where { $script:databasesToCheckTable.Contains( $_ ) } | `
        foreach { Check-DatabaseRedundancy $_ }

    # NOTE:
    # If the DB is completely removed from AD while this script is running, we may keep reporting
    # a Red alert for it. If need be, we can log a green event for the DB in this case...
}
# Creates a fresh CheckHADatabaseRedundancy.DatabaseRedundancyEntry, the object
# that carries one database's redundancy state. An entry is created once per
# database and then kept up to date across successive Check-Databases passes.
function CreateEmptyDatabaseRedundancyEntry
{
    $entryTypeName = "CheckHADatabaseRedundancy.DatabaseRedundancyEntry"
    return (New-Object -TypeName $entryTypeName)
}
# Prepares a redundancy entry for a new pass: the results of the previous pass
# move into the Last* properties, and the Current* properties are reset.
function Initialize-DatabaseRedundancyEntry ([CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy)
{
    # Remember what the previous pass found...
    $dbRedundancy.LastState = $dbRedundancy.CurrentState
    $dbRedundancy.LastRedundancyCount = $dbRedundancy.CurrentRedundancyCount

    # ...and start the new pass from a clean slate
    $dbRedundancy.CurrentErrorMessages = $null
    $dbRedundancy.CurrentState = "Unknown"
    $dbRedundancy.CurrentRedundancyCount = 0
}
# Renders the copy status entries received from the pipeline as one table-formatted
# string with the columns Name, Status, RealCopyQueue, InspectorQueue, ReplayQueue
# and CIState. The queue columns are derived from the log generation numbers and
# are never negative. Trailing white space is removed from the result.
function Get-SummaryCopyStatusString(
    [Parameter(Mandatory=$true,ValueFromPipeline=$true)] [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $status)
{
    Begin
    {
        $rows = @()
    }
    Process
    {
        # Extend each entry with the computed columns used by the table below
        $rows += ($status | Select-Object -Property *,
            @{Name="RealCopyQueue"; Expression={ [Math]::Max(0, $_.LastLogGenerated - $_.LastLogCopied) }},
            @{Name="InspectorQueue"; Expression={ [Math]::Max(0, $_.LastLogCopied - $_.LastLogInspected) }},
            @{Name="ReplayQueue"; Expression={ $_.ReplayQueueLength }},
            @{Name="CIState"; Expression={ $_.ContentIndexState }})
    }
    End
    {
        $columns = 'Name','Status','RealCopyQueue','InspectorQueue','ReplayQueue','CIState'
        [string]$statusStr = ($rows | Format-Table -Wrap -Property $columns | Out-String)

        # trim the white space at the end
        $statusStr -replace '\s+$'
    }
}
# Logic to decide if a DB has insufficient redundancy
#
# Parameters:
#   $dbName - name of the database; used as the key into $script:databaseStateTable
#             and $script:databaseToStatusTable.
# Behaviour:
#   Counts the copies whose health check returns Passed or Warning, sets the
#   database state to Red when that count is below $AtLeastNCopies (Green otherwise),
#   records state transition times, and finally calls PossiblyReport-RedGreenStatus.
function Check-DatabaseRedundancy ([string] $dbName)
{
Log-Verbose "Check-DatabaseRedundancy: '$dbName': Entering..."
# Initialize the DB redundancy state if necessary
if (!$script:databaseStateTable.Contains($dbName))
{
$dbState = CreateEmptyDatabaseRedundancyEntry
$dbState.DatabaseName = $dbName
$script:databaseStateTable.Add($dbName, $dbState)
Log-Verbose "Check-DatabaseRedundancy: '$dbName': Created empty DB redundancy state entry."
}
# Retrieve the redundancy state object, and initialize states
$dbRedundancy = $script:databaseStateTable[$dbName]
Initialize-DatabaseRedundancyEntry $dbRedundancy
# Get the list of copy status entries from the hashtable
[Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = @()
$statuses = $script:databaseToStatusTable[$dbName]
# Messages collected during this pass (errors, warnings and status summaries)
[string[]] $tmpErrMessages = @()
[string] $errMsg = $null
[string] $summaryStatusStr = $null
[CheckHADatabaseRedundancy.CopyCheckState]$checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Unknown
# In case there's only one configured copy, let's report that as an error
if ($statuses.Count -lt $AtLeastNCopies)
{
$tmpErrMessages += "The number of configured copies for database '$dbName' ($($statuses.Count)) is less than the required redundancy count ($AtLeastNCopies)."
}
Foreach ($status in $statuses)
{
# Check the health of the active or passive copy
($errMsg,$checkState) = Get-DatabaseCopyHealth $status
if ($checkState -eq [CheckHADatabaseRedundancy.CopyCheckState]::Passed)
{
$dbRedundancy.CurrentRedundancyCount++
}
elseif ($checkState -eq [CheckHADatabaseRedundancy.CopyCheckState]::Warning)
{
# A warning still counts towards redundancy, but its message is kept
$dbRedundancy.CurrentRedundancyCount++
$tmpErrMessages += $errMsg
}
else
{
# This copy has failed the check, so let's record the reason why
$tmpErrMessages += $errMsg
}
}
# If we've got some errors, remember them for emailing purposes
if ($tmpErrMessages.Length -gt 0)
{
# Append the summary status
$summaryStatusStr = ($statuses | Get-SummaryCopyStatusString)
$tmpErrMessages += "$summaryStatusStr"
# Add the overall errors to the history for this DB
$dbRedundancy.AddErrorRecordToHistory( [DateTime]::UtcNow, $tmpErrMessages )
}
if ($ShowDetailedErrors)
{
# Additionally, log the copy status output into the event
$statusStr = $summaryStatusStr
# The summary is only generated here if the error path above has not already added it
if (!$statusStr)
{
$statusStr = ($statuses | Get-SummaryCopyStatusString)
if ($statusStr)
{
$tmpErrMessages += "`n`n================`n Summary Status `n================`n`n$statusStr"
}
}
$statusStr = ($statuses | fl | Out-String)
if ($statusStr)
{
$tmpErrMessages += "`n`n===============`n Full Status `n===============`n`n$statusStr"
}
}
if ($tmpErrMessages.Length -gt 0)
{
$dbRedundancy.CurrentErrorMessages = $tmpErrMessages
}
Log-Verbose "Check-DatabaseRedundancy: '$dbName': CurrentRedundancyCount=$($dbRedundancy.CurrentRedundancyCount), LastRedundancyCount=$($dbRedundancy.LastRedundancyCount)"
# Decide if the state is Red or Green
if ($dbRedundancy.CurrentRedundancyCount -lt $AtLeastNCopies)
{
Log-Verbose "Check-DatabaseRedundancy: '$dbName': Redundancy count is lower than specified threshold of '$AtLeastNCopies'. Setting the state to 'Red'."
$dbRedundancy.CurrentState = [CheckHADatabaseRedundancy.AlertState]::Red
}
else
{
$dbRedundancy.CurrentState = [CheckHADatabaseRedundancy.AlertState]::Green
}
# record the state transition times
[datetime]$nowUtc = [DateTime]::UtcNow
if ($dbRedundancy.IsTransitioningState)
{
$dbRedundancy.LastStateTransitionUtc = $nowUtc
if ($dbRedundancy.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Green)
{
$dbRedundancy.LastGreenTransitionUtc = $nowUtc
# Leaving Red: add the time just spent in Red to the accumulated Red duration
if ($dbRedundancy.LastRedTransitionUtc)
{
[TimeSpan]$prevTimeInRed = $dbRedundancy.LastGreenTransitionUtc.Subtract( $dbRedundancy.LastRedTransitionUtc )
$dbRedundancy.PreviousTotalRedDuration = $dbRedundancy.PreviousTotalRedDuration.Add( $prevTimeInRed )
}
}
elseif ($dbRedundancy.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Red)
{
$dbRedundancy.LastRedTransitionUtc = $nowUtc
}
}
# Report a red/green event if necessary (suppression may occur if MonitoringContext is specified)
PossiblyReport-RedGreenStatus $dbRedundancy
}
# Decides whether the current Red/Green state of a database should be reported
# right now, and reports it through Report-GreenStatus / Report-RedStatus.
#
# Without -MonitoringContext every Green or Red state is reported immediately.
# With -MonitoringContext:
#   * Green is reported only after the database has been Green for more than
#     $SuppressGreenEventForSecs seconds, and only if no Green report is on record.
#   * Red is reported only once the total Red duration exceeds
#     $ReportRedEventAfterDurationSecs seconds; after the first report it is
#     repeated whenever more than $ReportRedEventIntervalSecs seconds have passed
#     since the last Red report.
function PossiblyReport-RedGreenStatus ( [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy )
{
    $isGreen = ($dbRedundancy.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Green)
    $isRed = (-not $isGreen) -and ($dbRedundancy.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Red)

    if (-not $MonitoringContext)
    {
        # No monitoring context, so no suppression
        if ($isGreen)
        {
            Report-GreenStatus $dbRedundancy
        }
        elseif ($isRed)
        {
            Report-RedStatus $dbRedundancy
        }
        return
    }

    # From here on we are in the monitoring context, where suppression applies

    if ($isGreen)
    {
        [int]$timeInGreenSecs = Get-ElapsedTimeInSeconds $dbRedundancy.LastGreenTransitionUtc
        if ($timeInGreenSecs -gt $SuppressGreenEventForSecs)
        {
            # Only log a green event once, or if it transitions into Green again
            if ($dbRedundancy.LastGreenReportedUtc -eq $null)
            {
                Report-GreenStatus $dbRedundancy
            }
        }
        return
    }

    if (-not $isRed)
    {
        return
    }

    # The total duration of being in Red (including flickering states) has to be
    # larger than $ReportRedEventAfterDurationSecs before anything is logged.
    if (-not ($dbRedundancy.TotalRedDuration.TotalSeconds -gt $ReportRedEventAfterDurationSecs))
    {
        return
    }

    # A red event that was never reported is always due
    $reportIsDue = ($dbRedundancy.LastRedReportedUtc -eq $null)
    if (-not $reportIsDue)
    {
        # Otherwise repeat the event every $ReportRedEventIntervalSecs seconds
        # while the DB is in "Red".
        [int]$timeSinceLastRedEventSecs = Get-ElapsedTimeInSeconds $dbRedundancy.LastRedReportedUtc
        $reportIsDue = ($timeSinceLastRedEventSecs -gt $ReportRedEventIntervalSecs)
    }

    if ($reportIsDue)
    {
        Report-RedStatus $dbRedundancy
    }
}
# Reports a Green status for a database, without any suppression.
# Stamps LastGreenReportedUtc, then either writes an information event to the
# Application log (monitoring context with event logging enabled) or emits the
# redundancy entry to the output pipeline (all other cases).
function Report-GreenStatus ( [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy )
{
    $dbRedundancy.LastGreenReportedUtc = [DateTime]::UtcNow
    [string]$dbCopyForAlerting = Get-DatabaseCopyForAlerting $dbRedundancy.DatabaseName

    Log-Verbose "Report-GreenStatus: Reporting a Green event for database copy '$dbCopyForAlerting'"

    $useEventLog = $MonitoringContext -and (-not $SkipEventLogging)
    if ($useEventLog)
    {
        # Write green event log into Application log
        # MonitoringDatabaseRedundancyCheckPassed - EventId 4114
        $eventParams = @($dbCopyForAlerting, $dbRedundancy.CurrentRedundancyCount, $dbRedundancy.GetErrorStringForAlerting())
        Write-HAAppLogInformationEvent "40041012" 1 $eventParams
    }
    else
    {
        Write-Output $dbRedundancy
    }
}
# Reports a Red status for a database, without any suppression.
# Stamps LastRedReportedUtc and clears LastGreenReportedUtc, then either writes an
# error event to the Application log (monitoring context with event logging
# enabled) or emits the redundancy entry to the output pipeline (all other cases).
function Report-RedStatus ( [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy )
{
    $dbRedundancy.LastRedReportedUtc = [DateTime]::UtcNow
    $dbRedundancy.LastGreenReportedUtc = $null
    [string]$dbCopyForAlerting = Get-DatabaseCopyForAlerting $dbRedundancy.DatabaseName

    Log-Verbose "Report-RedStatus: Reporting a RED event for database copy '$dbCopyForAlerting'!"

    $useEventLog = $MonitoringContext -and (-not $SkipEventLogging)
    if ($useEventLog)
    {
        # Write red event log into Application log
        # MonitoringDatabaseRedundancyCheckFailed - EventId 4113
        $eventParams = @($dbCopyForAlerting, $dbRedundancy.CurrentRedundancyCount, $dbRedundancy.GetErrorStringForAlerting())
        Write-HAAppLogErrorEvent "C0041011" 1 $eventParams
    }
    else
    {
        Write-Output $dbRedundancy
    }
}
# Returns the identifier to use in alerts for a database. Alerts are raised
# against the database name (not the DBCopy name), so the input is returned as is.
function Get-DatabaseCopyForAlerting ( [string] $dbName )
{
    Write-Output $dbName
}
# Returns the number of whole seconds (rounded down) between $startTimeUtc and
# the current UTC time, as an [int].
function Get-ElapsedTimeInSeconds( [DateTime] $startTimeUtc )
{
    $wholeSeconds = [System.Math]::Floor(([DateTime]::UtcNow - $startTimeUtc).TotalSeconds)
    return [int]$wholeSeconds
}
# Checks the health of one database copy by delegating to the active or the
# passive health check, depending on $copyStatus.ActiveCopy.
# Returns what the chosen check returns: an error/warning message followed by
# a CheckHADatabaseRedundancy.CopyCheckState value.
function Get-DatabaseCopyHealth ([Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $copyStatus)
{
    if (-not $copyStatus.ActiveCopy)
    {
        return Get-PassiveDatabaseCopyHealth $copyStatus
    }

    return Get-ActiveDatabaseCopyHealth $copyStatus
}
# Evaluates the health of an ACTIVE database copy.
#
# Parameters:
#   $copyStatus - the copy status entry of the active copy.
# Returns:
#   Two values, in this order: a message describing why the copy is not healthy
#   (empty when the copy passed) and a CheckHADatabaseRedundancy.CopyCheckState
#   value (Passed, Warning or Failed).
# Rules, in order of evaluation:
#   * the hosting server's cluster node must be Up, otherwise Failed;
#   * status ServiceDown is Failed;
#   * Dismounted/Dismounting is Failed when an error message is recorded, else Passed;
#   * Mounted is Passed; Mounting is a Warning;
#   * every other status is Failed.
function Get-ActiveDatabaseCopyHealth (
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $copyStatus)
{
    $dbCopy = $copyStatus.Name
    $dbName = $copyStatus.DatabaseName
    $server = $copyStatus.MailboxServer
    [string]$errMsg = $null
    [CheckHADatabaseRedundancy.CopyCheckState]$checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Unknown

    Log-Verbose "Get-ActiveDatabaseCopyHealth: Active copy '$dbCopy' has Status:$($copyStatus.Status), ErrorEventId:$($copyStatus.ErrorEventId), `nErrorMessage: $($copyStatus.ErrorMessage), `nSuspendComment: $($copyStatus.SuspendComment)"

    # Log that the DB is not replicated.
    # The lookup is only worth doing when verbose output is actually enabled.
    if ($VerbosePreference -ne "SilentlyContinue")
    {
        if ( !(Is-DatabaseReplicated ($script:databasesToCheckTable[$dbName])) )
        {
            Log-Verbose "Get-ActiveDatabaseCopyHealth: Database '$dbName' is NOT replicated. It has only 1 copy configured in the AD."
        }
    }

    # First, we need the cluster node status to be Up
    if (!(Is-DagServerOnline $server))
    {
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
        $errMsg = "Active copy '$dbCopy' is not UP according to clustering."
        Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$checkState
    }

    # If replay service is down, we'll just assume that this copy isn't healthy since
    # all the passives will be "stalled", which means we're anyway down to at most 1 copy.
    if ($copyStatus.Status -eq $CopyStatusType::ServiceDown)
    {
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
        $errMsg = "Active copy '$dbCopy' has replay service down. Assuming the copy is unhealthy."
        Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$checkState
    }

    if ( ($copyStatus.Status -eq $CopyStatusType::Dismounted) `
        -or ($copyStatus.Status -eq $CopyStatusType::Dismounting) )
    {
        # There may have been a permanent failure (such as a DB corruption) that is preventing
        # the DB from mounting. If so, there will be an error message recorded.
        if ($copyStatus.ErrorMessage)
        {
            $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
            $errMsg = "Active copy '$dbCopy' is dismounted with an error. Error: $($copyStatus.ErrorMessage)."
            Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
            return $errMsg,$checkState
        }
        else
        {
            # A dismounted copy is fine
            $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Passed
            return $errMsg,$checkState
        }
    }

    if ($copyStatus.Status -eq $CopyStatusType::Mounted)
    {
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Passed
        return $errMsg,$checkState
    }
    elseif ($copyStatus.Status -eq $CopyStatusType::Mounting)
    {
        # NOTE: This is only a warning and doesn't cause RED alert to go off!
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Warning
        $errMsg = "Active copy '$dbCopy' is in 'Mounting' state."
        Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Warning'."
        return $errMsg,$checkState
    }

    # Any other state, assume the worst
    $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
    $errMsg = "Active copy '$dbCopy' has some unknown/unhealthy state. Status: $($copyStatus.Status)."
    Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
    return $errMsg,$checkState
}
# Evaluates the health of a PASSIVE database copy.
#
# Returns two values, in this order: a message describing the problem (empty when
# the copy passed) and a CheckHADatabaseRedundancy.CopyCheckState value.
# The checks run in this order and the first one that trips decides the result:
#   1. cluster node of the hosting server not Up            -> Failed
#   2. copy status cannot be healthy                         -> Failed
#   3. real copy queue above $CopyQueueLengthThreshold       -> Failed
#   4. inspector queue above the failure threshold           -> Failed
#   5. active replay service down while its node is Up       -> Failed
#   6. inspector queue above the warning threshold           -> Warning
#   7. replay queue above the warning threshold              -> Warning
# If none trips, the copy is Passed.
function Get-PassiveDatabaseCopyHealth (
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $copyStatus)
{
    # Possible outcomes
    $failed = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
    $warning = [CheckHADatabaseRedundancy.CopyCheckState]::Warning
    $passed = [CheckHADatabaseRedundancy.CopyCheckState]::Passed

    $dbCopy = $copyStatus.Name
    $dbName = $copyStatus.DatabaseName
    $server = $copyStatus.MailboxServer
    $activeServer = $copyStatus.ActiveDatabaseCopy
    [string]$errMsg = $null

    Log-Verbose "Get-PassiveDatabaseCopyHealth: Passive copy '$dbCopy' has Status:$($copyStatus.Status), CopyQueueLength=$($copyStatus.CopyQueueLength), ReplayQueueLength=$($copyStatus.ReplayQueueLength), DatabaseName=$dbName, ErrorEventId:$($copyStatus.ErrorEventId), `nErrorMessage: $($copyStatus.ErrorMessage), `nSuspendComment: $($copyStatus.SuspendComment)"

    # 1. The cluster node hosting the copy has to be Up
    if (-not (Is-DagServerOnline $server))
    {
        $errMsg = "Passive copy '$dbCopy' is not UP according to clustering."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$failed
    }

    # 2. Rule out the obviously unhealthy copy states
    if (-not (Is-PassiveCopyPossiblyHealthy $copyStatus))
    {
        $errMsg = "Passive copy '$dbCopy' is not in a good state. Status: $($copyStatus.Status)."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$failed
    }

    # 3. The *real* copy queue length (i.e. not including the inspector queue)
    [int]$realCopyQ = [Math]::Max(0, $copyStatus.LastLogGenerated - $copyStatus.LastLogCopied)
    if ($realCopyQ -gt $CopyQueueLengthThreshold)
    {
        $errMsg = "Passive copy '$dbCopy' has actual log copy queue higher than the threshold of '$CopyQueueLengthThreshold'. Copy queue: $realCopyQ."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$failed
    }

    # 4. The inspector queue length, against the failure threshold
    [int]$inspectorQ = [Math]::Max(0, $copyStatus.LastLogCopied - $copyStatus.LastLogInspected)
    if ($inspectorQ -gt $InspectorQueueLengthFailedThreshold)
    {
        $errMsg = "Passive copy '$dbCopy' has an inspector queue higher than the failure threshold of '$InspectorQueueLengthFailedThreshold'. Inspector queue: $inspectorQ."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$failed
    }

    # 5. A small copy queue cannot be trusted when the active replay service is down,
    #    because the queues will then be stale. (E14# 138911)
    #    If the active status is "ServiceDown" but the node is up, we can be fairly
    #    certain that the queues are not to be trusted.
    if ((Is-ActiveReplayServiceDown $dbName) -and (Is-DagServerOnline $activeServer))
    {
        $errMsg = "Passive copy '$dbCopy' has a small copy queue length, but it could be stale. The active replay service on server '$activeServer' appears to be down. Copy queue: $($copyStatus.CopyQueueLength)."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$failed
    }

    # 6. The inspector queue length, against the warning threshold.
    #    NOTE: This is only a warning and doesn't cause RED alert to go off!
    if ($inspectorQ -gt $InspectorQueueLengthWarningThreshold)
    {
        $errMsg = "Passive copy '$dbCopy' has an inspector queue higher than the warning threshold of '$InspectorQueueLengthWarningThreshold'. Inspector queue: $inspectorQ."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Warning'."
        return $errMsg,$warning
    }

    # 7. The replay queue length.
    #    NOTE: This is only a warning and doesn't cause RED alert to go off!
    [int]$replayQ = [Math]::Max(0, $copyStatus.LastLogInspected - $copyStatus.LastLogReplayed)
    if ($replayQ -gt $ReplayQueueLengthWarningThreshold)
    {
        $errMsg = "Passive copy '$dbCopy' has a replay queue higher than the warning threshold of '$ReplayQueueLengthWarningThreshold'. Replay queue: $replayQ."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $errMsg Returning 'Warning'."
        return $errMsg,$warning
    }

    # Nothing tripped: the copy is good
    return $errMsg,$passed
}
# Collects the distinct names of all servers that host a copy of any of the
# given databases. Returns the names as a string array.
function Get-ServersForDatabases (
    [Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $databases)
{
    # The keys of this hashtable form the de-duplicated set of server names
    $uniqueServers = @{}
    foreach ($mdb in $databases)
    {
        $mdb.Servers | ForEach-Object {
            if (-not $uniqueServers.Contains($_.Name))
            {
                $uniqueServers.Add($_.Name, 1)
            }
        }
    }

    # Turn the key set into a plain string array
    [String[]]$serversList = @($uniqueServers.Keys)

    Log-Verbose "Get-ServersForDatabases: returning '$($serversList.Length)' servers."
    return $serversList
}
# Calls Get-MailboxDatabaseCopyStatus for a single server and returns the status entries.
# When $ParameterSetName is "Database", only the copy "$MailboxDatabaseName\<server>" is queried;
# otherwise all copies on the server are queried. The call is timed with $script:copyStatusStopwatch.
# Return type: Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]
function Get-CopyStatusFromServer ([string] $server, [string] $ParameterSetName)
{
    Log-Verbose "Get-CopyStatusFromServer( $server ): Entering..."

    $timer = $script:copyStatusStopwatch
    $timer.Reset()
    $timer.Start()

    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = @()
    if ($ParameterSetName -ne "Database")
    {
        $statuses = Get-MailboxDatabaseCopyStatus -Server $server
    }
    else
    {
        $statuses = @( Get-MailboxDatabaseCopyStatus "$MailboxDatabaseName\$server" )
    }

    $timer.Stop()
    Log-Verbose "Get-CopyStatusFromServer( $server ): operation completed in $($script:copyStatusStopwatch.Elapsed.TotalMilliseconds) ms."
    return $statuses
}
# Queries the copy status on each of the given servers, one after the other, and returns
# all the results combined into a single DatabaseCopyStatusEntry array.
# The whole sweep is timed with $script:copyStatusAllStopwatch.
function Get-CopyStatusFromAllServers ([String[]] $servers, [string] $ParameterSetName)
{
    Log-Verbose "Get-CopyStatusFromAllServers: Entering..."

    $timer = $script:copyStatusAllStopwatch
    $timer.Reset()
    $timer.Start()

    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $allStatuses = @()
    foreach ($serverName in $servers)
    {
        # Collect this server's entries, then append them to the combined list
        [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = @()
        $statuses = Get-CopyStatusFromServer -server $serverName -ParameterSetName $ParameterSetName
        $allStatuses += $statuses
    }

    $timer.Stop()
    Log-Verbose "Get-CopyStatusFromAllServers: operation completed in $($script:copyStatusAllStopwatch.Elapsed.TotalMilliseconds) ms."
    return $allStatuses
}
# Wouldn't it be nice to be able to run get-mdbcs in parallel across all the servers?
# Unfortunately, when this method was tested against just 3 servers in the DAG, it took
# ~26 seconds (25854.3543ms) !!!
# $REVIEW: Is Start-Job supposed to be that slow? It could be faster if the same PSSession
# were reused. "Invoke-Command -AsJob" is worth a look in future, but for now the overhead
# is definitely not worth it...
#
# Starts one background job per server, waits for all of them, and writes the copy status
# results of every job to the output pipeline.
#   servers - names of the servers to query
function Get-CopyStatusFromAllServersAsync ([String[]] $servers)
{
    if ((-not $servers) -or ($servers.Length -eq 0))
    {
        # Wait-Job rejects an empty job list, so there is nothing to do without servers
        Log-Verbose "Get-CopyStatusFromAllServersAsync: No servers specified; nothing to query."
        return
    }

    # We'll run Get-CopyStatus in parallel across all the servers so that we don't
    # excessively slow down the status retrieval in case some servers are down.
    [System.Management.Automation.PSRemotingJob[]] $asyncJobs = @()

    # A background job runs in its own process and cannot see this session's variables,
    # so the values the job needs are captured here and handed over via -ArgumentList.
    $jobParameterSetName = $PSCmdlet.ParameterSetName
    $jobDatabaseName = $MailboxDatabaseName

    $getStatusCmd =
    {
        param([string] $ParameterSetName, [string] $DatabaseName)
        Process
        {
            # The server name arrives through the job's -InputObject
            $tmpServer = $_;
            if (! (Get-PSSnapin Microsoft.Exchange.Management.PowerShell.E2010 -ErrorAction:SilentlyContinue) )
            {
                Add-PSSnapin Microsoft.Exchange.Management.PowerShell.E2010
            }
            [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = @()
            if ($ParameterSetName -eq "Database" )
            {
                $statuses = @( Get-MailboxDatabaseCopyStatus "$DatabaseName\$tmpServer" )
            }
            else
            {
                $statuses = @( Get-MailboxDatabaseCopyStatus -Server $tmpServer )
            }
            return $statuses
        }
    }

    $sw = New-Object -TypeName System.Diagnostics.Stopwatch
    $sw.Reset();
    $sw.Start();
    ######
    ## <Timed portion>
    Foreach ($server in $servers)
    {
        $asyncJobs += Start-Job -ScriptBlock $getStatusCmd -InputObject $server `
            -ArgumentList $jobParameterSetName, $jobDatabaseName
    }
    Log-Verbose "Get-CopyStatusFromAllServersAsync: Started $($asyncJobs.Length) async jobs."
    # wait on all of them to complete; Wait-Job emits the job objects, which must not
    # end up mixed into this function's output
    Wait-Job $asyncJobs | Out-Null
    $sw.Stop();
    ## </Timed portion>
    ######
    Log-Verbose "Get-CopyStatusFromAllServersAsync: Async operations completed in $($sw.Elapsed.TotalMilliseconds) ms."

    Foreach ($job in $asyncJobs)
    {
        $results = Receive-Job $job
        Log-Verbose "`$results = $results"
        $results
        # Completed jobs stay registered in the session until removed
        Remove-Job $job
    }
}
#######################################################################
# Dynamic code compiler logic
#######################################################################
# Builds the list of assembly reference paths for a dynamic compilation.
#   References - optional extra assemblies; an entry without a "\" is treated as a bare
#                file name and is prefixed with the framework directory.
# Returns the five default framework assemblies followed by the extra references.
# The framework directory is taken from a $framework variable when one is visible in
# scope; otherwise the directory of the running .NET runtime is used.
function ConstructReferences([Array]$References)
{
    # Resolve the framework directory without tripping strict mode on an undefined variable
    if (Test-Path Variable:\framework)
    {
        $frameworkDir = [string](Get-Variable -Name framework -ValueOnly)
    }
    else
    {
        $frameworkDir = [System.Runtime.InteropServices.RuntimeEnvironment]::GetRuntimeDirectory().TrimEnd("\")
    }

    #
    # Build up a compiler params object...
    # (a growable list is required here: a plain @() array has no Add/AddRange support)
    $refs = New-Object -TypeName System.Collections.ArrayList
    $refs.AddRange( @("${frameworkDir}\System.dll",
        "${frameworkDir}\system.windows.forms.dll",
        "${frameworkDir}\System.data.dll",
        "${frameworkDir}\System.Drawing.dll",
        "${frameworkDir}\System.Xml.dll"))

    # $null goes on the left so the comparison is a null test and not an array filter
    if (($null -ne $References) -and ($References.Count -ge 1))
    {
        foreach ($refAssembly in $References)
        {
            [string] $refTmp = $refAssembly
            if ($refTmp.IndexOf("\") -eq -1)
            {
                $refTmp = "${frameworkDir}\$refTmp"
            }
            # ArrayList.Add returns the new index; discard it so it does not pollute the output
            [void]$refs.Add($refTmp)
        }
    }
    return $refs.ToArray()
}
# Compile the types to be used for tracking the Database Redundancy state.
# The compilation is only performed once per runspace and is entirely in memory.
#
# The C# source below defines, in namespace CheckHADatabaseRedundancy:
#   AlertState              - enum: Unknown, Green, Red
#   CopyCheckState          - enum: Unknown, Passed, Warning, Failed
#   DatabaseRedundancyEntry - per-database state record with an error history
#   EventLogger             - static helper that writes an event through System.Diagnostics.EventLog
# Takes no parameters. Add-Type is only invoked when the AlertState type cannot be resolved yet.
function Prepare-DatabaseRedundancyEntryDefinition
{
# The source is a single-quoted string, so PowerShell performs no variable expansion inside it.
$code = '
using System;
using System.Collections.Generic;
using System.Diagnostics;
namespace CheckHADatabaseRedundancy
{
public enum AlertState : int
{
Unknown = 0,
Green,
Red
}
// Enum describing the state of an individual database copy. At the moment,
// both Passed and Warning are treated as having passed the redundancy check
// and hence CurrentRedundancyCount is incremented.
public enum CopyCheckState : int
{
Unknown = 0,
Passed,
Warning,
Failed
}
public class DatabaseRedundancyEntry
{
public class ErrorRecord
{
public DateTime ErrorTime { get; set; }
public string[] ErrorMessages { get; set; }
public string GetErrorStringForAlerting()
{
if (ErrorMessages == null || ErrorMessages.Length == 0)
{
return String.Empty;
}
return String.Join("\n", ErrorMessages);
}
}
public string DatabaseName { get; set; }
public int LastRedundancyCount { get; set; }
public int CurrentRedundancyCount { get; set; }
public AlertState LastState { get; set; }
public AlertState CurrentState { get; set; }
public DateTime? LastStateTransitionUtc { get; set; }
public DateTime? LastGreenTransitionUtc { get; set; }
public DateTime? LastRedTransitionUtc { get; set; }
public DateTime? LastGreenReportedUtc { get; set; }
public DateTime? LastRedReportedUtc { get; set; }
// the previous total red duration (not counting the current stretch of reds)
public TimeSpan PreviousTotalRedDuration { get; set; }
public TimeSpan TotalRedDuration
{
get
{
if (this.CurrentState == AlertState.Red)
{
// count the current duration of reds
return this.PreviousTotalRedDuration + (DateTime.UtcNow - this.LastRedTransitionUtc.Value);
}
else
{
return this.PreviousTotalRedDuration;
}
}
}
public bool IsTransitioningState
{
get { return LastState != CurrentState; }
}
public bool HasErrorsInHistory
{
get
{
if (this.ErrorHistory == null || this.ErrorHistory.Count == 0)
{
return false;
}
return true;
}
}
public string[] CurrentErrorMessages { get; set; }
public List<ErrorRecord> ErrorHistory { get; private set; }
public string GetErrorStringForAlerting()
{
if (CurrentErrorMessages == null || CurrentErrorMessages.Length == 0)
{
return String.Empty;
}
return String.Join("\n", CurrentErrorMessages);
}
// Create a copy of the errorMessages array and then add the record to the history
public void AddErrorRecordToHistory(DateTime errorTime, string[] errorMessages)
{
string[] tmpMessages = new string[errorMessages.Length];
errorMessages.CopyTo(tmpMessages, 0);
ErrorRecord er = new ErrorRecord();
er.ErrorTime = errorTime;
er.ErrorMessages = tmpMessages;
if (this.ErrorHistory == null)
{
this.ErrorHistory = new List<ErrorRecord>(15);
}
this.ErrorHistory.Add(er);
}
}
public static class EventLogger
{
public static void WriteLocalizedEvent(
string logName, // eg: Application
string sourceName, // eg: MSExchangeRepl
long eventId, // Message resource ID: eg: (long)0xC0041011
int categoryId, // category of the event
EventLogEntryType entryType, // error, information, warning
byte[] data,
params object[] messageArgs)
{
EventLog eventLog = new EventLog(logName, Environment.MachineName, sourceName);
EventInstance instance = new EventInstance(
eventId,
categoryId,
entryType);
eventLog.WriteEvent(instance, data, messageArgs);
}
}
}'
# Probe used to detect whether the types above already exist in this runspace.
$checkCompiledCmd =
{
# Check if the type is loaded. If not, a RuntimeException is thrown.
[CheckHADatabaseRedundancy.AlertState];
}
# TryExecute-ScriptBlock yields $false when the probe fails; the failure is not logged (silentOnErrors).
[bool]$isCompiled = TryExecute-ScriptBlock -runCommand:$checkCompiledCmd -silentOnErrors:$true
if (!$isCompiled)
{
##################################################################
# So now we compile the code and use .NET object access to run it.
##################################################################
Log-Verbose "Compiling code..."
# The source is compiled with the "CSharpVersion3" language setting.
Add-Type -TypeDefinition $code -Language "CSharpVersion3"
Log-Verbose "Done!"
}
}
# Writes an Information event to the Application log under the MSExchangeRepl source.
#   eventId     - message resource ID as a hexadecimal string
#   categoryId  - category of the event
#   messageArgs - values forwarded as the event's message arguments
function Write-HAAppLogInformationEvent(
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Object[]] $messageArgs)
{
    Write-LocalizedEventLog -logName "Application" -sourceName "MSExchangeRepl" `
        -eventId $eventId -categoryId $categoryId -entryType "Information" `
        -data $null -messageArgs $messageArgs
}
# Writes a Warning event to the Application log under the MSExchangeRepl source.
#   eventId     - message resource ID as a hexadecimal string
#   categoryId  - category of the event
#   messageArgs - values forwarded as the event's message arguments
function Write-HAAppLogWarningEvent(
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Object[]] $messageArgs)
{
    Write-LocalizedEventLog -logName "Application" -sourceName "MSExchangeRepl" `
        -eventId $eventId -categoryId $categoryId -entryType "Warning" `
        -data $null -messageArgs $messageArgs
}
# Writes an Error event to the Application log under the MSExchangeRepl source.
#   eventId     - message resource ID as a hexadecimal string
#   categoryId  - category of the event
#   messageArgs - values forwarded as the event's message arguments
function Write-HAAppLogErrorEvent(
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Object[]] $messageArgs)
{
    Write-LocalizedEventLog -logName "Application" -sourceName "MSExchangeRepl" `
        -eventId $eventId -categoryId $categoryId -entryType "Error" `
        -data $null -messageArgs $messageArgs
}
# Converts the hexadecimal event ID string to a number and forwards all arguments to
# [CheckHADatabaseRedundancy.EventLogger]::WriteLocalizedEvent.
#   logName     - name of the event log
#   sourceName  - event source
#   eventId     - message resource ID as a hexadecimal string
#   categoryId  - category of the event
#   entryType   - event log entry type
#   data        - optional binary data passed along with the event
#   messageArgs - values passed along as the event's message arguments
function Write-LocalizedEventLog(
    [Parameter(Mandatory=$true)] [string] $logName,
    [Parameter(Mandatory=$true)] [string] $sourceName,
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Parameter(Mandatory=$true)] [System.Diagnostics.EventLogEntryType] $entryType,
    [Byte[]] $data,
    [Object[]] $messageArgs)
{
    # The ID string is hexadecimal, so it has to be parsed as such
    $hexStyle = [System.Globalization.NumberStyles]::HexNumber
    [Int64]$numericId = [Int64]::Parse($eventId, $hexStyle)

    [CheckHADatabaseRedundancy.EventLogger]::WriteLocalizedEvent($logName, $sourceName, $numericId, $categoryId, $entryType, $data, $messageArgs)
}
# Common function to run a scriptblock, log any error that occurred, and return
# a boolean to indicate whether it was successful or not.
# NOTE: ErrorActionPreference of "Stop" is used to catch all errors.
# NOTE: Any output produced by $runCommand or $cleanupCommand is captured and discarded;
# callers that need results must have the scriptblock store them somewhere itself.
#
# Parameters:
#
# runCommand
# The scriptblock to execute.
#
# Optional parameters:
#
# cleanupCommand
# This scriptblock will be executed with ErrorActionPreference of "Continue",
# if an error occurred while running $runCommand.
#
# throwOnError
# If true, the error from $runCommand will be rethrown. Otherwise 'false' is returned on error.
#
# silentOnErrors
# If true, the error from $runCommand will not be logged via Log-ErrorRecord (i.e. Write-Error)
function TryExecute-ScriptBlock ([ScriptBlock]$runCommand, [ScriptBlock]$cleanupCommand={}, [bool]$throwOnError=$false, [bool]$silentOnErrors=$false)
{
# Run the following in a separate script block so that we can change
# ErrorActionPreference without affecting the rest of the script.
&{
$ErrorActionPreference = "Stop"
[bool]$success = $false;
try
{
# Wrapping the call in @() captures whatever the scriptblock emits so that it does
# not become part of this function's output.
$ignoredObjects = @(&$runCommand)
$success = $true;
}
catch
{
# Any error will end up in this catch block
# For some reason, PS does not write out any errors unless I use this
# scriptblock with "Continue" ErrorActionPreference.
&{
$ErrorActionPreference = "Continue"
if (!$silentOnErrors)
{
Log-ErrorRecord $_
}
# Run the cleanup scriptblock
$ignoredObjects = @(&$cleanupCommand)
}
if ($throwOnError)
{
# Rethrow the error that was caught
throw
}
}
finally
{
# Curious PS behavior: It appears that 'return' trumps 'throw', so don't return...
# A value is therefore only returned on success, or on failure when no rethrow was requested.
if (!$throwOnError -or $success)
{
return $success
}
}
}
}
# Pauses the script for the given number of seconds, logging the pause first.
function Sleep-ForSeconds ( [int]$sleepSecs )
{
    Log-Verbose "Sleeping for $sleepSecs seconds..."
    Start-Sleep -Seconds $sleepSecs
}
# Returns the current UTC time of day formatted as a bracketed log prefix
function Get-CurrentTimeString
{
    $nowUtc = [DateTime]::UtcNow
    return $nowUtc.ToString("[HH:mm:ss.fff UTC]")
}
# Writes the message to the verbose stream, prefixed with the current UTC time
function Log-Verbose ( [string]$msg )
{
    Write-Verbose "$(Get-CurrentTimeString) $msg"
}
# Writes the message to the warning stream, prefixed with the current UTC time
function Log-Warning ( [string]$msg )
{
    Write-Warning "$(Get-CurrentTimeString) $msg"
}
# Writes the message to the error stream, prefixed with the current UTC time.
# With -Stop the error is written with -ErrorAction:Stop.
function Log-Error ( [string]$msg, [switch]$Stop)
{
    $stampedMsg = "$(Get-CurrentTimeString) $msg"
    if ($Stop)
    {
        Write-Error $stampedMsg -ErrorAction:Stop
    }
    else
    {
        Write-Error $stampedMsg
    }
}
# Common function for logging an error, given an ErrorRecord.
#   errRecord - the error to report
#   Stop      - forwarded to Log-Error
# The record's text is flattened to a single line and logged together with the name of the
# command that failed (empty when the record carries no invocation information).
function Log-ErrorRecord( [System.Management.Automation.ErrorRecord] $errRecord, [switch]$Stop )
{
    # Trim the message so it will not display the "ErrorActionPreference is set to Stop" message
    #
    $stopMarker = "set to Stop: "
    $failedMessage = $errRecord.ToString()
    $markerIndex = $failedMessage.IndexOf($stopMarker)
    # Only cut when the marker is really there. Cutting at a fixed offset without it would
    # either drop the first characters of an unrelated message or fail on a short one.
    if (($failedMessage.IndexOf("ErrorActionPreference") -ne -1) -and ($markerIndex -ne -1))
    {
        $failedMessage = $failedMessage.Substring($markerIndex + $stopMarker.Length)
    }
    $failedMessage = $failedMessage -replace "`r"
    $failedMessage = $failedMessage -replace "`n"

    # Not every error record has invocation information attached
    $failedCommand = $null
    if ($errRecord.InvocationInfo)
    {
        $failedCommand = $errRecord.InvocationInfo.MyCommand
    }
    Log-Error "Failed at command '$failedCommand' with '$failedMessage'" -Stop:$Stop
}
# Emits the objects received from the input pipeline in a random order.
# NOTE: This function only works when invoked via a pipeline.
function Shuffle-Objects(
    [Parameter(Mandatory=$true,ValueFromPipeline=$true)] $inputData
)
{
    Begin
    {
        $pool = @()
    }
    Process
    {
        # gather everything before shuffling
        $pool += $inputData
    }
    End
    {
        $count = $pool.Length

        # walk the list once, exchanging each slot with a randomly chosen slot at or after it
        [int]$pos = 0
        while ($pos -lt $count)
        {
            [int]$pick = Get-Random -Minimum $pos -Maximum $count
            $held = $pool[$pos]
            $pool[$pos] = $pool[$pick]
            $pool[$pick] = $held
            $pos++
        }

        # hand the shuffled elements to the output pipeline one by one
        for ([int]$k = 0; $k -lt $count; $k++)
        {
            $pool[$k]
        }
    }
}
# Sends a mail message to the specified recipients.
#
# Based on Send-Mail function from DatacenterHealthCommonLibrary.ps1 (service engineering scripts).
#
# Two delivery paths exist:
# (1) when $script:IsDataCenterLibraryPresent is set, the datacenter library's send-mail
#     function is called (with strict mode switched off around the call);
# (2) otherwise the mail is handed to Send-HANotificationMailCorpHub, and an error is
#     logged if that function reports failure.
#
function Send-HANotificationMail(
    [string]$title,
    [string]$body,
    [string[]]$attachments,
    [string]$from,
    [string[]]$tos,
    [string[]]$ccs,
    [string]$pri = "Normal",
    [int]$maxRetryAttempts = 2)
{
    if (-not $script:IsDataCenterLibraryPresent)
    {
        Log-Verbose "Calling Send-HANotificationMailCorpHub..."
        [bool]$sent = Send-HANotificationMailCorpHub -title:$title -body:$body -attachments:$attachments `
            -from:$from -tos:$tos -ccs:$ccs -pri:$pri -maxRetryAttempts:$maxRetryAttempts
        if (-not $sent)
        {
            Log-Error "Send mail failed!"
        }
        return
    }

    Log-Verbose "Calling DataCenter send-mail function..."
    Set-StrictMode -Off
    send-mail -title:$title -body:$body -from:$from -tos:$tos -ccs:$ccs -attachments:$attachments -pri:$pri
    Set-StrictMode -Version 2.0
}
# Returns the Exchange servers whose IsHubTransportServer property is set
function Get-HubServers
{
    Get-ExchangeServer | Where-Object { $_.IsHubTransportServer }
}
# Emits one SmtpClient (using default credentials) per hub transport server, with the
# servers taken in random order. Logs an error and emits nothing when no hub server is found.
#
function Build-HubSmtpClients
{
    #FUTURE: We should return a list of hostnames and port and let the caller iterate over them if failure...
    # Also, try to choose a server in the same site as we are running
    # In production we will use the "send-mail" function provided by the svc engineering team
    $shuffledHubs = Get-HubServers | Shuffle-Objects
    if (-not $shuffledHubs)
    {
        Log-Error "No Hub Server found!"
        return
    }

    foreach ($hub in @($shuffledHubs))
    {
        $client = New-Object System.Net.Mail.SmtpClient($hub.Fqdn)
        $client.UseDefaultCredentials = $true
        $client
    }
}
# Returns the SMTP clients that can be used for sending mail; at present these are
# the clients produced by Build-HubSmtpClients.
function Get-SmtpClients ()
{
    Log-Verbose "Entering Get-SmtpClients:"
    $hubClients = @()
    $hubClients += Build-HubSmtpClients
    $hubClients
}
# Creates a System.Net.Mail.MailMessage from the given pieces and returns it.
#   title       - subject line
#   body        - message text
#   attachments - file paths; a path that does not exist is skipped
#   from        - sender address
#   tos / ccs   - recipient addresses
#   pri         - mail priority
#
function Build-MailMsg(
    [string]$title,
    [string]$body,
    [string[]]$attachments,
    [string]$from,
    [string[]]$tos,
    [string[]]$ccs,
    [string]$pri = "Normal")
{
    $message = New-Object System.Net.Mail.MailMessage
    $message.Subject = $title
    $message.Body = $body
    $message.Priority = $pri
    $message.From = New-Object System.Net.Mail.MailAddress($from)

    # Attach only those files that can actually be found
    if ($attachments)
    {
        foreach ($path in @($attachments))
        {
            if (-not (Test-Path $path))
            {
                continue
            }
            $item = New-Object System.Net.Mail.Attachment -ArgumentList $path, 'Application/Octet'
            [void]$message.Attachments.Add($item)
        }
    }

    foreach ($recipient in @($tos))
    {
        [void]$message.To.Add($recipient)
    }

    if ($ccs)
    {
        foreach ($copyRecipient in @($ccs))
        {
            [void]$message.CC.Add($copyRecipient)
        }
    }

    return $message
}
# Sends an email message through one of the available SMTP clients.
# Returns $true as soon as one SMTP host accepted the mail, $false otherwise.
# There is no guarantee that the mail will get through.
# Each client is tried up to $maxRetryAttempts times before moving on to the next one.
# This is cloned from Send-NotificationMail in DatacenterSvcEngCommonLibrary.ps1, in a
# simplified form.
#
function Send-HANotificationMailCorpHub(
    [string]$title,
    [string]$body,
    [string[]]$attachments,
    [string]$from,
    [string[]]$tos,
    [string[]]$ccs,
    [string]$pri = "Normal",
    [int]$maxRetryAttempts = 2)
{
    Log-Verbose "Entering Send-HANotificationMailCorpHub: `$from=$from, `$pri=$pri"

    $smtpClients = @(Get-SmtpClients)
    if (-not $smtpClients)
    {
        Log-Error "Get-SmtpClients failed!"
        return $false
    }

    $mailMessage = Build-MailMsg -title $title -body $body -attachments $attachments `
        -from $from -tos $tos -ccs $ccs -pri $pri
    if (-not $mailMessage)
    {
        Log-Error "Build-MailMsg failed!"
        return $false
    }

    try
    {
        foreach ($client in $smtpClients)
        {
            # Change the timeout for synchronous Send() call to 30 secs
            $client.Timeout = 30000
            $failures = 0
            Log-Verbose "Sending notification mail to: $([string]::Join(';',$tos))"
            Log-Verbose "Using SMTP client for '$($client.Host)', port=$($client.Port)"
            Log-Verbose "Sending mail from '$from'..."
            do
            {
                $sent = $true
                try
                {
                    $client.Send($mailMessage)
                    Log-Verbose "Mail sent!"
                    return $true
                }
                catch
                {
                    $sent = $false
                    $failures++
                    if ($failures -ne $maxRetryAttempts)
                    {
                        Log-Verbose "Retrying to send mail to $tos."
                    }
                    else
                    {
                        Log-Verbose "Exceeded $maxRetryAttempts retries sending mail to $tos."
                    }
                }
            } while ((-not $sent) -and ($failures -lt $maxRetryAttempts))
            Log-Verbose "Send failed. Trying to use a different smtp client if possible..."
        }
    }
    finally
    {
        # the message is released on every exit path, including the early success return
        $mailMessage.Dispose()
    }
    return $sent
}
# Appends a summary of each piped-in database state to the $script:report StringBuilder:
# the database name, its current redundancy count and total red minutes, followed by one
# entry per record in the database's error history.
#   dbState - the database state to describe (accepted from the pipeline)
function Append-RedundancyInformation(
    [Parameter(Mandatory=$true,ValueFromPipeline=$true)] [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbState)
{
    Process
    {
        $totalMins = $dbState.TotalRedDuration.TotalMinutes.ToString("F2")
        $msg = "
Database : $($dbState.DatabaseName)
Redundancy Count : $($dbState.CurrentRedundancyCount)
Total Red Minutes : $totalMins"
        $script:report.AppendLine($msg) | Out-Null

        # ErrorHistory stays null until the first error is recorded. Wrapping a null in @()
        # would produce a one-element array holding $null and run the loop body against it,
        # so the history is only walked when it actually has entries.
        if ($dbState.HasErrorsInHistory)
        {
            foreach ($errRecord in $dbState.ErrorHistory)
            {
                $timeStr = $errRecord.ErrorTime.ToString("HH:mm:ss.fff UTC")
                $msg = "
$timeStr :
$($errRecord.GetErrorStringForAlerting())
"
                $script:report.AppendLine($msg) | Out-Null
            }
        }
    }
}
# Builds the database redundancy report from $script:databaseStateTable and mails it to
# $SendSummaryMailTos. Nothing is sent when no recipients are configured or when no
# database is reported as low on redundancy or as having errors in its history.
function Send-SummaryEmail
{
    if (-not $SendSummaryMailTos)
    {
        return
    }

    [System.Text.StringBuilder]$script:report = New-Object -TypeName System.Text.StringBuilder -ArgumentList 2048
    $states = $script:databaseStateTable.Values

    # Databases that are red right now and have been red for longer than the reporting threshold
    $dbsWithOneCopy = $states |
        Where-Object { ($_.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Red) -and ($_.TotalRedDuration.TotalSeconds -gt $ReportRedEventAfterDurationSecs) } |
        Sort-Object -Property DatabaseName

    # Databases with recorded errors that do not fall into the group above
    $dbsWithErrors = $states |
        Where-Object { $_.HasErrorsInHistory -and ( ($_.CurrentState -ne [CheckHADatabaseRedundancy.AlertState]::Red) -or ($_.TotalRedDuration.TotalSeconds -le $ReportRedEventAfterDurationSecs) ) } |
        Sort-Object -Property DatabaseName

    [int]$databasesCount = ($states | Measure-Object).Count
    [int]$dbsWithErrorsCount = ($dbsWithErrors | Measure-Object).Count
    [int]$dbsOneCopyCount = ($dbsWithOneCopy | Measure-Object).Count

    if ( ($dbsWithErrorsCount -eq 0) -and ($dbsOneCopyCount -eq 0) )
    {
        # No need to send an email to report that everything is healthy
        Log-Verbose "Skipping sending an email report since everything is healthy."
        return
    }

    [string]$priority = "Normal"

    # Comma-separated name lists for the report header
    $dbsWithOneCopyNames = ( $dbsWithOneCopy | Select-Object -ExpandProperty DatabaseName )
    $dbsWithErrorsNames = ( $dbsWithErrors | Select-Object -ExpandProperty DatabaseName )
    [string]$dbsWithOneCopyNamesStr = $null
    [string]$dbsWithErrorsNamesStr = $null
    if ($dbsWithOneCopyNames)
    {
        $dbsWithOneCopyNamesStr = [string]::Join(", ", $dbsWithOneCopyNames)
    }
    if ($dbsWithErrorsNames)
    {
        $dbsWithErrorsNamesStr = [string]::Join(", ", $dbsWithErrorsNames)
    }

    $activeSet = $PSCmdlet.ParameterSetName

    [string] $msg = "
***************************************
Database Redundancy Report
$((Get-Date).DateTime)
***************************************"
    [void]$script:report.AppendLine($msg)

    if ($activeSet -eq "Database")
    {
        $msg = "Database : $MailboxDatabaseName"
        [void]$script:report.AppendLine($msg)
    }
    elseif ($activeSet -eq "Server")
    {
        $msg = "Server : $MailboxServerName"
        [void]$script:report.AppendLine($msg)
    }

    $msg = "DatabaseCount : $databasesCount
DbsCountWithLowRedundancy : $dbsOneCopyCount
DbsCountWithErrors : $dbsWithErrorsCount
DatabasesWithLowRedundancy : $dbsWithOneCopyNamesStr
DatabasesWithErrors : $dbsWithErrorsNamesStr"
    [void]$script:report.AppendLine($msg)

    if ($dbsOneCopyCount -gt 0)
    {
        # mark email as urgent
        $priority = "High"
        $msg = "
=================================================================
Databases with low redundancy ( < $AtLeastNCopies copies)
================================================================="
        [void]$script:report.AppendLine($msg)
        @($dbsWithOneCopy) | Append-RedundancyInformation
    }

    if ($dbsWithErrorsCount -gt 0)
    {
        $msg = "
=================================================================
Databases with errors ( >= $AtLeastNCopies copies)
================================================================="
        [void]$script:report.AppendLine($msg)
        @($dbsWithErrors) | Append-RedundancyInformation
    }

    # Create the email subject
    [string]$title = "DB Redundancy: "
    if ($script:dagName)
    {
        $title += "$($script:dagName): "
    }

    if ($activeSet -eq "Database")
    {
        $title += "$MailboxDatabaseName - "
    }
    elseif ($activeSet -eq "Server")
    {
        $title += "$MailboxServerName - "
    }

    if ($dbsOneCopyCount -gt 0)
    {
        $durationMins = [TimeSpan]::FromSeconds($ReportRedEventAfterDurationSecs).TotalMinutes
        if ($dbsOneCopyCount -ne 1)
        {
            $title += "$dbsOneCopyCount DBs have less than $AtLeastNCopies copies for more than $durationMins mins"
        }
        else
        {
            $title += "1 DB has less than $AtLeastNCopies copies for more than $durationMins mins"
        }
    }
    elseif ($dbsWithErrorsCount -gt 0)
    {
        if ($dbsWithErrorsCount -ne 1)
        {
            $title += "$dbsWithErrorsCount DBs have had errors in the past hour"
        }
        else
        {
            $title += "1 DB has had errors in the past hour"
        }
    }
    else
    {
        $title += "All DBs have been sufficiently redundant for the past hour"
    }

    # send the email
    Send-HANotificationMail -title:$title -body:($script:report.ToString()) -from:$SummaryMailFrom -tos:$SendSummaryMailTos -pri:$priority
}
###################################################################
### Entry point for the script itself
###################################################################
# Runs a single iteration of the monitoring check and writes the objects it produced
# to the output pipeline. The outcome of the iteration is logged either way.
function RunOnce
{
    $script:outputObjects = $null

    # The iteration runs inside TryExecute-ScriptBlock so that any error is trapped and
    # ends the whole iteration. Its results are parked in a script-scope variable.
    [bool]$iterationOk = TryExecute-ScriptBlock -runCommand { $script:outputObjects = RunOnceInternal }

    if (-not $iterationOk)
    {
        Log-Error "Iteration $($script:iteration) of the monitoring check FAILED due to an error."
    }
    else
    {
        Log-Verbose "Iteration $($script:iteration) of the monitoring check completed successfully."
    }

    # send to the output pipeline
    $script:outputObjects
}
# Performs one iteration of the check: determines the databases to monitor from the active
# parameter set, hands them to Check-Databases and times the iteration with
# $script:oneIterationStopwatch.
function RunOnceInternal
{
    $script:oneIterationStopwatch.Reset()
    $script:oneIterationStopwatch.Start()
    $script:iteration++
    Log-Verbose "Starting iteration $($script:iteration) of the monitoring check..."

    $activeSet = $PSCmdlet.ParameterSetName

    # The databases being monitored
    [Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $mdbs = @()

    if ($activeSet -eq "Database")
    {
        # Lookup the specified database
        $mdb = Get-MailboxDatabase $MailboxDatabaseName
        if (-not $mdb)
        {
            Log-Verbose "Could not find database matching '$MailboxDatabaseName'."
        }
        else
        {
            if ($mdb.Recovery)
            {
                Log-Error "Database '$mdb' is a recovery database. Please specify a non-recovery database to check the health of." -Stop
                return
            }
            # We will check databases even if they only have 1 configured copy
            $mdbs += $mdb
        }
    }
    elseif ($activeSet -eq "Server")
    {
        # Lookup all databases on the specified server
        $server = Get-MailboxServer $MailboxServerName
        if (-not $server)
        {
            Log-Verbose "Could not find server matching '$MailboxServerName'."
        }
        else
        {
            Log-Verbose "Found mailbox server '$MailboxServerName'."
            $script:mailboxServer = $server
            $allMdbs = @( Get-MailboxDatabase -Server $server )
            if (-not $SkipDatabasesRegex)
            {
                # no database name filter specified, so check against all
                $mdbs = @( $allMdbs | Where-Object { !$_.Recovery } )
            }
            else
            {
                # filter out the DBs matching the regex specified
                Log-Verbose "Filtering out databases matching the following regex: '$SkipDatabasesRegex'"
                $mdbs = @( $allMdbs | Where-Object { ($_.Name -inotmatch $SkipDatabasesRegex) -and (!$_.Recovery) } )
            }
        }
    }

    Log-Verbose "Found $($mdbs.Length) databases..."

    # perform the check
    if (-not ($mdbs.Length -gt 0))
    {
        Log-Verbose "Skipping Check-Databases since there are no databases to check!"
    }
    else
    {
        Check-Databases $mdbs $activeSet
    }

    $script:oneIterationStopwatch.Stop()
    Log-Verbose "Iteration $($script:iteration) of the monitoring check completed in $($script:oneIterationStopwatch.Elapsed.TotalMilliseconds) ms"
}
# This function returns true if you can remove the copy without losing redundancy and false otherwise
#
# Parameters:
#   databaseName - name of the database whose copy would be removed (required, non-empty)
#   serverName   - name of the server hosting that copy (required, non-empty)
# Returns: [bool] - $true when Check-DatabaseRedundancy reports the "Green" state for the
#          database after the removal has been simulated.
# Throws when a parameter is missing or empty, or when the copy "<database>\<server>" is not
# among the retrieved copy statuses.
function Check-DatabaseRedundancyForCopyRemoval(
[string] $databaseName = $(throw "Check-DatabaseRedundancyForCopyRemoval: databaseName is required."),
[string] $serverName = $(throw "Check-DatabaseRedundancyForCopyRemoval: serverName is required."))
{
if ( -not $databaseName ) { throw "Check-DatabaseRedundancyForCopyRemoval: databaseName cannot be empty." }
if ( -not $serverName ) { throw "Check-DatabaseRedundancyForCopyRemoval: serverName cannot be empty." }
# These assignments create function-local variables carrying the names of script parameters.
# Get-CopyStatusFromServer reads $MailboxDatabaseName for the "Database" parameter set;
# NOTE(review): the other two are presumably read by the functions called below - confirm.
$MailboxDatabaseName = $databaseName
$SkipEventLogging = $true
$MonitoringContext = $false
$databases = @( Get-MailboxDatabase $MailboxDatabaseName )
Populate-DatabasesTable $databases
# find the servers to check copy statuses on
[String[]]$servers = Get-ServersForDatabases $databases
if ($servers.Length -lt 2)
{
# Normally we should not get here, since we're only checking replicated DBs, which means
# we should have at least 2 distinct servers. However, this can happen if copies are
# removed while this script is running.
Log-Warning "Check-DatabaseRedundancyForCopyRemoval: Get-ServersForDatabases returned only '$($servers.Length)' servers."
}
# get the status results and index them by database name
$script:databaseToStatusTable.Clear()
$script:databaseToStatusTable = Get-CopyStatusFromAllServers $servers "Database" | `
Group-Object -AsHashTable -Property DatabaseName
# look up the cluster node status for the DAG
Populate-ClusterNodeStatus
# Simulate copy removal
# Remember the configured minimum so that it can be put back before returning.
[UInt32] $regularAtLeastNCopies = $AtLeastNCopies
[bool] $foundCopy = $false
for ($i = 0; $i -lt $script:databaseToStatusTable[$MailboxDatabaseName].Count; $i++)
{
if ( $script:databaseToStatusTable[$MailboxDatabaseName][$i].Name -eq "$MailboxDatabaseName\$serverName" )
{
if ( $script:databaseToStatusTable[$MailboxDatabaseName][$i].ActiveCopy )
{
# The active copy's status entry is kept in the table; the required number of copies is raised by one instead.
$AtLeastNCopies = $regularAtLeastNCopies + 1
Log-Verbose "Check-DatabaseRedundancyForCopyRemoval: Active copy $MailboxDatabaseName\$serverName will be removed. Redundancy count should be at least $AtLeastNCopies for active copy removal."
}
else
{
# A passive copy is simulated as removed by dropping its status entry from the table.
Log-Verbose "Check-DatabaseRedundancyForCopyRemoval: Passive copy $MailboxDatabaseName\$serverName will be removed."
$script:databaseToStatusTable[$MailboxDatabaseName].RemoveAt($i)
}
$foundCopy = $true
break
}
}
if ( -not $foundCopy )
{
throw "Check-DatabaseRedundancyForCopyRemoval: Copy $MailboxDatabaseName\$serverName was not found."
}
# NOTE(review): Check-DatabaseRedundancy is not visible here; it presumably evaluates the
# status table against $AtLeastNCopies - confirm.
$status = Check-DatabaseRedundancy $MailboxDatabaseName
[bool] $result = $status.CurrentState -eq "Green"
$AtLeastNCopies = $regularAtLeastNCopies
return $result
}
# Drives the script: does nothing in DotSourceMode, runs a single check outside the
# monitoring context, and otherwise repeats the check (sleeping between iterations) until
# there is no longer enough time left for another iteration. A summary email is attempted
# once the checks are done.
function Main
{
    if ($PSCmdlet.ParameterSetName -eq "DotSourceMode")
    {
        Log-Verbose "Script run with -DotSourceMode. Exiting."
        return
    }

    # Ensure this table is cleared at script startup
    # Other hashtables get cleared every iteration of RunOnce.
    $script:databaseStateTable.Clear()

    # Validate the email parameters
    if ($SendSummaryMailTos -and (-not $SummaryMailFrom))
    {
        Log-Error "Please specify a -SummaryMailFrom address as well when -SendSummaryMailTos is used."
        # Let monitoring continue anyway...
    }

    if (-not $MonitoringContext)
    {
        Log-Verbose "Running once."
        RunOnce
        Send-SummaryEmail
        return
    }

    # We are in the MonitoringContext
    [System.Diagnostics.Stopwatch] $overallScriptStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
    $overallScriptStopwatch.Reset()
    $overallScriptStopwatch.Start()
    Log-Verbose "Running many times..."

    # A duration of -1 or 0 means "keep going indefinitely"
    $runsForever = ($TerminateAfterDurationSecs -eq -1) -or ($TerminateAfterDurationSecs -eq 0)

    while ($true)
    {
        RunOnce

        # decide if we should run the next iteration
        if (-not $runsForever)
        {
            [double]$lastIterationMsecs = $script:oneIterationStopwatch.Elapsed.TotalMilliseconds
            [double]$timeLeftMsecs = [double]($TerminateAfterDurationSecs * 1000) - $overallScriptStopwatch.Elapsed.TotalMilliseconds
            # Is there enough time left for a (sleep + RunOnce) ?
            [double]$neededMsecs = [double]($SleepDurationBetweenIterationsSecs * 1000) + $lastIterationMsecs
            if (-not ($neededMsecs -lt $timeLeftMsecs))
            {
                break
            }
        }

        Sleep-ForSeconds $SleepDurationBetweenIterationsSecs
    }

    Send-SummaryEmail
}
# Script start-up: log the script path, make sure the helper types and the Exchange snap-in
# are available, locate the optional datacenter mail library, then hand over to Main.
$Command = $MyInvocation.MyCommand
Log-Verbose "Starting: $($Command.Path)"
# The command below is useful to see what parameters are defined in this script cmdlet.
# $Command | fl Path, CommandType, Parameters, ParameterSets
# Get the code compilation out of the way
Prepare-DatabaseRedundancyEntryDefinition
LoadExchangeSnapin
# In datacenter configurations we can use libraries provided by service engineering
$InstallPath = (Get-ItemProperty -path 'HKLM:SOFTWARE\Microsoft\ExchangeServer\v14\Setup').MsiInstallPath.Trim().TrimEnd("\")
$DatacenterLibraryPath = "$InstallPath\DataCenter"
$SvcLibaryFileName = "DatacenterHealthCommonLibrary.ps1"
$ServiceCommonLib = "$DatacenterLibraryPath\$SvcLibaryFileName"
$script:IsDataCenterLibraryPresent = Test-Path $ServiceCommonLib
if ($script:IsDataCenterLibraryPresent)
{
    # The library is only needed for mailing, so it is only loaded when a summary mail was requested
    if ($SendSummaryMailTos)
    {
        # Get a send-mail function
        Log-Verbose "Loading DataCenter script library '$ServiceCommonLib'"
        # The common lib doesn't use clean practices so we have to avoid strict mode
        Set-StrictMode -Off
        . $ServiceCommonLib
        Set-StrictMode -Version 2.0
    }
}
else
{
    # Without the library, Send-HANotificationMail delivers through a hub transport server,
    # so mail is not skipped in this case.
    Log-Verbose "File '$DatacenterLibraryPath\$SvcLibaryFileName' is not present, so any summary mail will be sent through a hub transport server instead."
}
Main
# .SYNOPSIS
# Checks the redundancy of databases by validating that they have at least N
# configured and "healthy" copies. Active and passive copies are both counted.
# .DESCRIPTION
#
# Copyright (c) 2010 Microsoft Corporation. All rights reserved.
#
# THIS CODE IS MADE AVAILABLE AS IS, WITHOUT WARRANTY OF ANY KIND. THE ENTIRE RISK
# OF THE USE OR THE RESULTS FROM THE USE OF THIS CODE REMAINS WITH THE USER.
# To use this script you need to provide either $MailboxDatabaseName or $MailboxServerName.
# To generate events for Monitoring, you need to provide -MonitoringContext switch.
[CmdletBinding(DefaultParametersetName="Server")]
param(
    [Parameter(ParameterSetName="Database",Mandatory=$true,Position=0)]
    [string] $MailboxDatabaseName,

    # By default, check against the local server
    [Parameter(ParameterSetName="Server",Position=0)]
    [string] $MailboxServerName = $env:COMPUTERNAME,

    # Skip checking the "default" mailbox databases. eg: Mailbox Database 0017891750
    # Specify $null (or an empty string) if you don't want to skip any databases.
    [Parameter(ParameterSetName="Server")]
    [string] $SkipDatabasesRegex = "^Mailbox Database \d{10}$",

    [Parameter(ParameterSetName="Monitoring",Mandatory=$true)]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [switch] $MonitoringContext = $false,

    [Parameter(ParameterSetName="Monitoring")]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [UInt32] $SleepDurationBetweenIterationsSecs = 60,

    [Parameter(ParameterSetName="Monitoring")]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [Int32] $TerminateAfterDurationSecs = 3480, # 58 minutes; -1,0 are "Infinite"

    [Parameter(ParameterSetName="Monitoring")]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [UInt32] $SuppressGreenEventForSecs = 600, # 10 minutes

    # If the total duration of being "red" exceeds this amount, raise the Red event
    [Parameter(ParameterSetName="Monitoring")]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [UInt32] $ReportRedEventAfterDurationSecs = 1200, # 20 minutes

    # Once we raise a red event, report it periodically every $ReportRedEventIntervalSecs seconds.
    [Parameter(ParameterSetName="Monitoring")]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [UInt32] $ReportRedEventIntervalSecs = 900, # 15 minutes

    [Parameter(ParameterSetName="Monitoring")]
    [Parameter(ParameterSetName="Database")]
    [Parameter(ParameterSetName="Server")]
    [switch] $SkipEventLogging = $false,

    [UInt32] $AtLeastNCopies = 2,

    # If false, detailed summary status is left out of the events/objects reported
    [switch] $ShowDetailedErrors = $false,

    # The email FROM address to use for the summary report
    [string] $SummaryMailFrom = $null,

    # Send a summary report email to the following addresses
    [string[]] $SendSummaryMailTos = $null,

    # Useful to "dot-source" this script as a library - call the script as such:
    # PS D:\Exchange Mailbox\v14\Scripts> . .\CheckDatabaseRedundancy.ps1 -DotSourceMode
    [Parameter(ParameterSetName="DotSourceMode",Mandatory=$true)]
    [switch] $DotSourceMode = $false
)

Set-StrictMode -Version 2.0

# Adds the Exchange 2010 management snap-in to the session unless it is already loaded.
function LoadExchangeSnapin
{
    if (! (Get-PSSnapin Microsoft.Exchange.Management.PowerShell.E2010 -ErrorAction:SilentlyContinue) )
    {
        Add-PSSnapin Microsoft.Exchange.Management.PowerShell.E2010
    }
}

LoadExchangeSnapin

#---------------------------------------
# Aliases for commonly used enum types #
#---------------------------------------
$CopyStatusType = [Microsoft.Exchange.Management.SystemConfigurationTasks.CopyStatus]
$ReplicationTypeType = [Microsoft.Exchange.Data.Directory.SystemConfiguration.ReplicationType]
$MountDialType = [Microsoft.Exchange.Data.Directory.SystemConfiguration.AutoDatabaseMountDial]

#------------
# Constants #
#------------
# This is the maximum copy queue length considered "healthy" for a passive copy.
# Currently, this value is 12 as defined by BestAvailability.
$CopyQueueLengthThreshold = [int]$MountDialType::BestAvailability
$InspectorQueueLengthWarningThreshold = $CopyQueueLengthThreshold
$InspectorQueueLengthFailedThreshold = 1000
$ReplayQueueLengthWarningThreshold = 500

#-------------------
# Script variables #
#-------------------
[System.Diagnostics.Stopwatch] $script:copyStatusStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:copyStatusAllStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:clusterNodeStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:clusterNodeOverallStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
[System.Diagnostics.Stopwatch] $script:oneIterationStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
$script:databaseToStatusTable = @{}   # Hashtable indexed by DatabaseName, value of Collection<DatabaseCopyStatusEntry>
$script:databasesToCheckTable = @{}   # Hashtable for the databases that we want to check, indexed by DatabaseName
$script:databaseStateTable = @{}      # Hashtable indexed by DatabaseName, that holds the redundancy state of each DB
$script:clusterNodeStateTable = @{}   # Hashtable indexed by server name, that holds the cluster node state (Up, Down, Joining, Paused, Unknown)
$script:outputObjects = @()           # List of objects to send to the output pipeline
[Microsoft.Exchange.Data.Directory.Management.MailboxServer] $script:mailboxServer = $null
[UInt64] $script:iteration = 0
$script:clusterOutput = $null
[string]$script:dagName = $null
[System.Text.StringBuilder]$script:report = $null
$script:IsDataCenterLibraryPresent = $false

# Returns $true when the database's ReplicationType is 'Remote'; $false otherwise.
function Is-DatabaseReplicated ([Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase] $mdb)
{
    if ($mdb.ReplicationType -eq $ReplicationTypeType::Remote)
    {
        return $true;
    }
    return $false;
}

# Returns $true only when $serverName is present in $script:clusterNodeStateTable and its
# recorded state equals "Up" (case-insensitive comparison); $false in every other case.
function Is-DagServerOnline ([string] $serverName)
{
    Log-Verbose "Is-DagServerOnline: Entering: `$serverName=$serverName"
    [bool]$isOnline = $false

    # This is locale-dependent.
    if ($script:clusterNodeStateTable.Contains($serverName) -and
        $script:clusterNodeStateTable[$serverName] -ieq "Up")
    {
        $isOnline = $true
    }

    Log-Verbose "Is-DagServerOnline: Leaving (returning '$isOnline')"
    return $isOnline
}

# The following states are possibly healthy for a passive copy:
#   Healthy, DisconnectedAndHealthy, SeedingSource
# Returns $true for those three statuses and $false for any other status.
function Is-PassiveCopyPossiblyHealthy ([Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $status)
{
    $healthy = $false;
    switch ($status.Status)
    {
        $CopyStatusType::Healthy { $healthy = $true }
        $CopyStatusType::DisconnectedAndHealthy { $healthy = $true }
        $CopyStatusType::SeedingSource { $healthy = $true }
        default { }
    }
    return $healthy
}

# Looks up the copy statuses collected for $databaseName and returns $true when no active
# copy is found among them, or when the active copy's status is ServiceDown; $false otherwise.
function Is-ActiveReplayServiceDown ([string] $databaseName)
{
    Log-Verbose "Is-ActiveReplayServiceDown: '$databaseName': Entering..."
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = @()
    $statuses = $script:databaseToStatusTable[$databaseName]

    # guarantees that $activeStatus is a single entry, instead of a collection
    $activeStatus = $statuses | where { $_.ActiveCopy } | select -First 1
    if (!$activeStatus)
    {
        Log-Verbose "Is-ActiveReplayServiceDown: '$databaseName': No ActiveCopy found! Assuming it has replay service down."
        return $true
    }

    if ($activeStatus.Status -eq $CopyStatusType::ServiceDown)
    {
        Log-Verbose "Is-ActiveReplayServiceDown: Active copy '$($activeStatus.Name)' has replay service down."
        return $true
    }

    Log-Verbose "Is-ActiveReplayServiceDown: '$databaseName': Leaving, returning 'False'"
    return $false
}

# Rebuilds $script:databasesToCheckTable from scratch, indexing each database by its Name.
function Populate-DatabasesTable ([Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $databases)
{
    $script:databasesToCheckTable.Clear()
    Foreach ($database in $databases)
    {
        $script:databasesToCheckTable[$database.Name] = $database
    }
}

# Queries the cluster remotely for the node status, using the server name specified.
# Returns $true if the cluster.exe command completed successfully; $false otherwise.
# Additionally, if $true is being returned, $clusterNodeStateTable is updated.
function Get-ClusterNodeStatus ([string] $clusterName)
{
    $script:clusterNodeStateTable.Clear()
    $script:clusterNodeStopwatch.Reset();
    $script:clusterNodeStopwatch.Start();
    [bool]$success = $false
    $script:clusterOutput = $null

    # $LastExitCode is only updated when a native command actually runs. Reset it so that a
    # stale 0 left by an earlier command cannot be mistaken for success if cluster.exe never
    # gets launched, and so that the variable is always defined under strict mode.
    $global:LASTEXITCODE = $null

    # Run the cluster command, instead of relying on "get-DAG -Status" since that makes
    # a replay RPC and the replay service could be down.
    # NOTE: This command can take a while (30-45 secs) when the quorum is lost, or netname is down.
    Log-Verbose "Get-ClusterNodeStatus: Running command: cluster.exe /cluster:$clusterName node"
    $clusCommand = {
        # The '2>&1' redirects errors to the success stream. See 'help about_redirection' for more info.
        $script:clusterOutput = ( ( cluster.exe /cluster:$clusterName node ) 2>&1 )
    }
    $dummy = TryExecute-ScriptBlock -runCommand $clusCommand -silentOnErrors $true

    if ( $LastExitCode -eq 1722 )
    {
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe failed to contact the cluster. RPC_S_SERVER_UNAVAILABLE. The cluster netname or node may be down, or quorum lost."
    }
    elseif ( $LastExitCode -ne 0 )
    {
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe did not succced. Return value $LastExitCode. `nOutput: $($script:clusterOutput)"
    }
    else
    {
        $success = $true
    }

    if ($success)
    {
        Log-Verbose "Get-ClusterNodeStatus: cluster.exe returned the following output:`n $($script:clusterOutput)";

        # Sample output:
        #
        # Listing status for all available nodes:
        #
        # Node           Node ID Status
        # -------------- ------- ---------------------
        # EXCH-I-782           1 Up
        # EXCH-D-770           2 Up
        # EXCH-D-772           3 Down
        $match = select-string -pattern "(?<server>\S+)\s+\d+\s+(?<state>\S+)" -inputobject ($script:clusterOutput) -allmatches
        if ( $match -ne $null )
        {
            $match.Matches | `
                foreach { $script:clusterNodeStateTable.Add( $_.Groups["server"].Value.Trim(), $_.Groups["state"].Value.Trim() ) }
        }
        else
        {
            $success = $false
            Log-Verbose "Get-ClusterNodeStatus: cluster.exe output returned no regex matches!"
        }
    }

    $script:clusterNodeStopwatch.Stop()
    Log-Verbose "Get-ClusterNodeStatus: cluster.exe operation completed in $($script:clusterNodeStopwatch.Elapsed.TotalMilliseconds) ms. Returning '$success'."
    return $success
}

# Queries the cluster node states for all cluster servers via the cluster group name (which is the DAG name).
# It then parses the output into a hashtable indexed by server name, with the cluster node state for each (i.e. Up, Down, etc.)
# If the netname query fails, each DAG member server is tried in turn; if every query fails,
# all DAG members are recorded as "Down".
function Populate-ClusterNodeStatus
{
    $script:dagName = $null
    $script:clusterNodeStateTable.Clear()

    # Find the DAG name first.
    if ($script:mailboxServer)
    {
        $script:dagName = $script:mailboxServer.DatabaseAvailabilityGroup.Name
    }
    else
    {
        # running in DB mode, which means there should only be one DB in this table
        $db = $script:databasesToCheckTable.Values | select -First 1
        $script:dagName = $db.MasterServerOrAvailabilityGroup.Name
    }
    Log-Verbose "Populate-ClusterNodeStatus: Found DAG '$($script:dagName)'."

    $script:clusterNodeOverallStopwatch.Reset();
    $script:clusterNodeOverallStopwatch.Start();

    # First, try with the cluster netname (which is the DAG name)
    [bool]$success = Get-ClusterNodeStatus $script:dagName
    [Microsoft.Exchange.Data.Directory.SystemConfiguration.DatabaseAvailabilityGroup]$dag = $null

    if (!$success)
    {
        Log-Verbose "Populate-ClusterNodeStatus: Failed to query the cluster using the cluster netname of '$($script:dagName)'! Querying DAG member servers instead."
        $dag = Get-DatabaseAvailabilityGroup $script:dagName
        if ($dag)
        {
            if (!$dag.Servers -or `
                ($dag.Servers.Count -eq 0))
            {
                Log-Verbose "Populate-ClusterNodeStatus: DAG '$($script:dagName)' contains no servers!"
            }
            else
            {
                # Pick a random server to start querying.
                # Environment.TickCount is a signed 32-bit value that becomes negative once the
                # machine has been up long enough, which would make the remainder (and hence the
                # start index) negative. Mask off the sign bit so the index is always in range.
                [int]$index = [int](([System.Environment]::TickCount -band [int]::MaxValue) % $dag.Servers.Count)
                Log-Verbose "Populate-ClusterNodeStatus: Enumerating DAG servers starting at index='$index', server='$($dag.Servers[$index])'."

                # $REVIEW: If the below syntax is used, 'break' doesn't exit out of the 'foreach'.
                # Instead, in this case, break exits out of the entire script! Scary... I'll stick to for loops for now.
                # 1..$dag.Servers.Count | foreach `
                for ($iteration = 1; $iteration -le $dag.Servers.Count; $iteration++)
                {
                    $serverName = $dag.Servers[$index]
                    $success = Get-ClusterNodeStatus $serverName
                    if ($success)
                    {
                        break
                    }
                    else
                    {
                        # wrap around so every member is tried exactly once
                        $index = ($index + 1) % $dag.Servers.Count
                    }
                }
            }
        }
        else
        {
            Log-Verbose "Populate-ClusterNodeStatus: Could not find DAG '$($script:dagName)'! Marking all nodes as 'Down'."
        }
    }

    if (!$success)
    {
        # Mark all the DAG members as being down.
        # In case $dag is null, Is-DagServerOnline will return $false since the table should be empty
        if ($dag)
        {
            $dag.Servers | foreach { $script:clusterNodeStateTable.Add( $_.Name, "Down" ) }
        }
    }

    $script:clusterNodeOverallStopwatch.Stop()
    Log-Verbose "Populate-ClusterNodeStatus: Overall operation completed in $($script:clusterNodeOverallStopwatch.Elapsed.TotalMilliseconds) ms."
}

# Runs one redundancy pass over $databases: refreshes the table of databases to check,
# collects copy statuses from every server hosting a copy, refreshes the cluster node
# states, and then calls Check-DatabaseRedundancy for each database that has statuses
# and is in the table of databases to check.
# $ParameterSetName is passed through to the copy-status query.
function Check-Databases ([Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $databases, [string] $ParameterSetName)
{
    Populate-DatabasesTable $databases

    # find the servers to check copy statuses on
    # (@() keeps this an array even when the function emits nothing, so .Length is always valid)
    [String[]]$servers = @(Get-ServersForDatabases $databases)
    if ($servers.Length -lt 2)
    {
        # Normally we should not get here, since we're only checking replicated DBs, which means
        # we should have at least 2 distinct servers. However, this can happen if copies are
        # removed while this script is running.
        Log-Warning "Get-ServersForDatabases returned only '$($servers.Length)' servers."
    }

    # get the status results and index them by database name
    $script:databaseToStatusTable.Clear()
    $statusesByDatabase = Get-CopyStatusFromAllServers $servers $ParameterSetName | `
        Group-Object -AsHashTable -Property DatabaseName
    if ($statusesByDatabase -eq $null)
    {
        # Group-Object emits nothing when it receives no input. Keep the table a real
        # (empty) hashtable so the .Keys access below and the next pass's .Clear() still work.
        $statusesByDatabase = @{}
    }
    $script:databaseToStatusTable = $statusesByDatabase

    # look up the cluster node status for the DAG
    Populate-ClusterNodeStatus

    Log-Verbose "Check-Databases: Filtering out databases we are not going to check..."
    # Filter out the databases we are not going to check, and then perform the redundancy check.
    $script:databaseToStatusTable.Keys | `
        where { $script:databasesToCheckTable.Contains( $_ ) } | `
        foreach { Check-DatabaseRedundancy $_ }

    # NOTE:
    # If the DB is completely removed from AD while this script is running, we may keep reporting
    # a Red alert for it. If need be, we can log a green event for the DB in this case...
}

# This object represents a DB's redundancy state. It is initialized once at script startup and
# is subsequently maintained over multiple passes of Check-Databases.
function CreateEmptyDatabaseRedundancyEntry
{
    # Hands back a newly constructed DatabaseRedundancyEntry.
    return [CheckHADatabaseRedundancy.DatabaseRedundancyEntry](New-Object -TypeName "CheckHADatabaseRedundancy.DatabaseRedundancyEntry")
}

# Prepares an entry for a new pass: the "current" count and state are rolled into their
# "last" counterparts, then the current values and error messages are reset.
function Initialize-DatabaseRedundancyEntry ([CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy)
{
    # roll, then reset, the redundancy count
    $dbRedundancy.LastRedundancyCount = $dbRedundancy.CurrentRedundancyCount
    $dbRedundancy.CurrentRedundancyCount = 0

    # roll, then reset, the alert state
    $dbRedundancy.LastState = $dbRedundancy.CurrentState
    $dbRedundancy.CurrentState = "Unknown"

    $dbRedundancy.CurrentErrorMessages = $null
}

# Pipeline function: collects every copy status it receives and, at the end, writes a
# single table-formatted string (Name, Status and queue/CI columns) for all of them.
function Get-SummaryCopyStatusString(
    [Parameter(Mandatory=$true,ValueFromPipeline=$true)]
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $status)
{
    Begin
    {
        $rows = @()
    }
    Process
    {
        # Decorate the status with the derived queue lengths shown in the table
        $row = $status | Select-Object *,`
            @{Name="RealCopyQueue"; Expression={ [Math]::Max(0, $_.LastLogGenerated - $_.LastLogCopied) }}, `
            @{Name="InspectorQueue"; Expression={ [Math]::Max(0, $_.LastLogCopied - $_.LastLogInspected) }}, `
            @{Name="ReplayQueue"; Expression={ $_.ReplayQueueLength }}, `
            @{Name="CIState"; Expression={ $_.ContentIndexState }}
        $rows += $row
    }
    End
    {
        [string]$tableText = ($rows | ft -Wrap Name,Status,RealCopyQueue,InspectorQueue,ReplayQueue,CIState | Out-String)
        # drop the white space Out-String leaves at the end
        $tableText = $tableText -replace "\s+$"
        Write-Output $tableText
    }
}

# Decides whether database $dbName has enough healthy copies.
# Counts the copies whose health check is Passed or Warning, records the error messages,
# sets the entry's state to Red when the count is below $AtLeastNCopies (Green otherwise),
# tracks state-transition times, and finally hands the entry to PossiblyReport-RedGreenStatus.
function Check-DatabaseRedundancy ([string] $dbName)
{
    Log-Verbose "Check-DatabaseRedundancy: '$dbName': Entering..."

    $passedVerdict  = [CheckHADatabaseRedundancy.CopyCheckState]::Passed
    $warningVerdict = [CheckHADatabaseRedundancy.CopyCheckState]::Warning
    $greenAlert     = [CheckHADatabaseRedundancy.AlertState]::Green
    $redAlert       = [CheckHADatabaseRedundancy.AlertState]::Red

    # First time we see this DB: create and register its state entry
    if (-not $script:databaseStateTable.Contains($dbName))
    {
        $freshEntry = CreateEmptyDatabaseRedundancyEntry
        $freshEntry.DatabaseName = $dbName
        $script:databaseStateTable.Add($dbName, $freshEntry)
        Log-Verbose "Check-DatabaseRedundancy: '$dbName': Created empty DB redundancy state entry."
    }

    # Fetch the state entry and roll it over for this pass
    $entry = $script:databaseStateTable[$dbName]
    Initialize-DatabaseRedundancyEntry $entry

    # Copy status entries gathered for this database
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $copyStatuses = @()
    $copyStatuses = $script:databaseToStatusTable[$dbName]

    [string[]] $messages = @()
    [string] $copyMessage = $null
    [string] $summaryText = $null
    [CheckHADatabaseRedundancy.CopyCheckState]$copyVerdict = [CheckHADatabaseRedundancy.CopyCheckState]::Unknown

    # Too few configured copies is itself reported as an error
    if ($copyStatuses.Count -lt $AtLeastNCopies)
    {
        $messages += "The number of configured copies for database '$dbName' ($($copyStatuses.Count)) is less than the required redundancy count ($AtLeastNCopies)."
    }

    Foreach ($copyStatus in $copyStatuses)
    {
        # Health of this (active or passive) copy
        ($copyMessage,$copyVerdict) = Get-DatabaseCopyHealth $copyStatus

        # Passed and Warning both count towards redundancy...
        if (($copyVerdict -eq $passedVerdict) -or ($copyVerdict -eq $warningVerdict))
        {
            $entry.CurrentRedundancyCount++
        }
        # ...but anything other than Passed leaves a message behind
        if ($copyVerdict -ne $passedVerdict)
        {
            $messages += $copyMessage
        }
    }

    # Keep a history record when anything was reported (used for emailing)
    if ($messages.Length -gt 0)
    {
        # The summary table is appended to the messages
        $summaryText = ($copyStatuses | Get-SummaryCopyStatusString)
        $messages += "$summaryText"

        $entry.AddErrorRecordToHistory( [DateTime]::UtcNow, $messages )
    }

    if ($ShowDetailedErrors)
    {
        # Detailed mode: make sure the summary is present, then add the full status dump
        $detailText = $summaryText
        if (-not $detailText)
        {
            $detailText = ($copyStatuses | Get-SummaryCopyStatusString)
            if ($detailText)
            {
                $messages += "`n`n================`n Summary Status `n================`n`n$detailText"
            }
        }

        $detailText = ($copyStatuses | fl | Out-String)
        if ($detailText)
        {
            $messages += "`n`n===============`n Full Status `n===============`n`n$detailText"
        }
    }

    if ($messages.Length -gt 0)
    {
        $entry.CurrentErrorMessages = $messages
    }

    Log-Verbose "Check-DatabaseRedundancy: '$dbName': CurrentRedundancyCount=$($entry.CurrentRedundancyCount), LastRedundancyCount=$($entry.LastRedundancyCount)"

    # Red when below the required count, Green otherwise
    if ($entry.CurrentRedundancyCount -lt $AtLeastNCopies)
    {
        Log-Verbose "Check-DatabaseRedundancy: '$dbName': Redundancy count is lower than specified threshold of '$AtLeastNCopies'. Setting the state to 'Red'."
        $entry.CurrentState = $redAlert
    }
    else
    {
        $entry.CurrentState = $greenAlert
    }

    # Remember when the state changed
    [datetime]$nowUtc = [DateTime]::UtcNow
    if ($entry.IsTransitioningState)
    {
        $entry.LastStateTransitionUtc = $nowUtc

        if ($entry.CurrentState -eq $greenAlert)
        {
            $entry.LastGreenTransitionUtc = $nowUtc
            if ($entry.LastRedTransitionUtc)
            {
                # Accumulate the time spent in the Red period that just ended
                [TimeSpan]$redPeriod = $entry.LastGreenTransitionUtc.Subtract( $entry.LastRedTransitionUtc )
                $entry.PreviousTotalRedDuration = $entry.PreviousTotalRedDuration.Add( $redPeriod )
            }
        }
        elseif ($entry.CurrentState -eq $redAlert)
        {
            $entry.LastRedTransitionUtc = $nowUtc
        }
    }

    # Report a red/green event if necessary (suppression may occur if MonitoringContext is specified)
    PossiblyReport-RedGreenStatus $entry
}

# Reports Red/Green status for the entry. Without $MonitoringContext every call reports
# the current state; with it, Green and Red reports are suppressed according to
# $SuppressGreenEventForSecs, $ReportRedEventAfterDurationSecs and $ReportRedEventIntervalSecs.
function PossiblyReport-RedGreenStatus ( [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy )
{
    $isGreen = ($dbRedundancy.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Green)
    $isRed   = ($dbRedundancy.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Red)

    if (-not $MonitoringContext)
    {
        # No monitoring context, so no suppression
        if ($isGreen)
        {
            Report-GreenStatus $dbRedundancy
        }
        elseif ($isRed)
        {
            Report-RedStatus $dbRedundancy
        }
        return
    }

    # Monitoring context: apply the suppression rules
    if ($isGreen)
    {
        [int]$secondsInGreen = Get-ElapsedTimeInSeconds $dbRedundancy.LastGreenTransitionUtc

        # Only log a green event once, or if it transitions into Green again
        if (($secondsInGreen -gt $SuppressGreenEventForSecs) -and `
            ($dbRedundancy.LastGreenReportedUtc -eq $null))
        {
            Report-GreenStatus $dbRedundancy
        }
    }
    elseif ($isRed)
    {
        # A red event is due once the total duration of being in Red (including flickering
        # states) is larger than $ReportRedEventAfterDurationSecs.
        if ($dbRedundancy.TotalRedDuration.TotalSeconds -gt $ReportRedEventAfterDurationSecs)
        {
            if ($dbRedundancy.LastRedReportedUtc -eq $null)
            {
                # Reporting a red event for the first time
                Report-RedStatus $dbRedundancy
            }
            else
            {
                # After that, repeat the event every $ReportRedEventIntervalSecs seconds
                # while the DB is in "Red".
                [int]$secondsSinceRedEvent = Get-ElapsedTimeInSeconds $dbRedundancy.LastRedReportedUtc
                if ($secondsSinceRedEvent -gt $ReportRedEventIntervalSecs)
                {
                    Report-RedStatus $dbRedundancy
                }
            }
        }
    }
}

# Reports Green status via mail/event as appropriate [no suppression].
function Report-GreenStatus ( [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy )
{
    # Stamps the entry as Green-reported, then either writes the "check passed" event
    # (monitoring context with event logging enabled) or writes the entry to the pipeline.
    $dbRedundancy.LastGreenReportedUtc = [DateTime]::UtcNow

    [string]$alertTarget = Get-DatabaseCopyForAlerting $dbRedundancy.DatabaseName
    Log-Verbose "Report-GreenStatus: Reporting a Green event for database copy '$alertTarget'"

    # The event replaces the pipeline output only when both conditions hold
    [bool]$logEvent = ($MonitoringContext -and !$SkipEventLogging)
    if ($logEvent)
    {
        # Write green event log into Application log
        # MonitoringDatabaseRedundancyCheckPassed - EventId 4114
        Write-HAAppLogInformationEvent "40041012" 1 @($alertTarget, $dbRedundancy.CurrentRedundancyCount, $dbRedundancy.GetErrorStringForAlerting())
    }
    else
    {
        Write-Output $dbRedundancy
    }
}

# Reports Red status via mail/event as appropriate [no suppression].
# Stamps the entry as Red-reported and clears its Green-reported time, then either writes
# the "check failed" event (monitoring context with event logging enabled) or writes the
# entry to the pipeline.
function Report-RedStatus ( [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbRedundancy )
{
    $dbRedundancy.LastRedReportedUtc = [DateTime]::UtcNow
    $dbRedundancy.LastGreenReportedUtc = $null

    [string]$alertTarget = Get-DatabaseCopyForAlerting $dbRedundancy.DatabaseName
    Log-Verbose "Report-RedStatus: Reporting a RED event for database copy '$alertTarget'!"

    # The event replaces the pipeline output only when both conditions hold
    [bool]$logEvent = ($MonitoringContext -and !$SkipEventLogging)
    if ($logEvent)
    {
        # Write red event log into Application log
        # MonitoringDatabaseRedundancyCheckFailed - EventId 4113
        Write-HAAppLogErrorEvent "C0041011" 1 @($alertTarget, $dbRedundancy.CurrentRedundancyCount, $dbRedundancy.GetErrorStringForAlerting())
    }
    else
    {
        Write-Output $dbRedundancy
    }
}

# Returns the name to use in alerts, which is the database name itself.
function Get-DatabaseCopyForAlerting ( [string] $dbName )
{
    # We need to report the Database name (not the DBCopy name)
    return $dbName
}

# Returns the whole number of seconds (rounded down) between $startTimeUtc and the
# current UTC time.
function Get-ElapsedTimeInSeconds( [DateTime] $startTimeUtc )
{
    [TimeSpan]$sinceStart = [DateTime]::UtcNow.Subtract( $startTimeUtc )
    [int]$wholeSeconds = [int][System.Math]::Floor($sinceStart.TotalSeconds)
    return $wholeSeconds
}

# Dispatches to the active or passive health check depending on $copyStatus.ActiveCopy
# and returns whatever that check returns.
function Get-DatabaseCopyHealth ([Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $copyStatus)
{
    if (-not $copyStatus.ActiveCopy)
    {
        return Get-PassiveDatabaseCopyHealth $copyStatus
    }

    return Get-ActiveDatabaseCopyHealth $copyStatus
}

# Returns the string describing why the copy was not healthy (empty if the copy is healthy),
# followed by the CopyCheckState of the copy.
function Get-ActiveDatabaseCopyHealth ( `
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $copyStatus)
{
    # Evaluates the health of an ACTIVE database copy.
    # Returns two values: the message explaining a Failed/Warning result (left empty for
    # Passed) and the CheckHADatabaseRedundancy.CopyCheckState.
    #   Failed  - cluster node not Up, replay service down, dismounted with an error,
    #             or any status not handled below
    #   Passed  - Mounted, or Dismounted/Dismounting without an error message
    #   Warning - Mounting
    $dbCopy = $copyStatus.Name
    $dbName = $copyStatus.DatabaseName
    $server = $copyStatus.MailboxServer
    [string]$errMsg = $null
    [CheckHADatabaseRedundancy.CopyCheckState]$checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Unknown

    Log-Verbose "Get-ActiveDatabaseCopyHealth: Active copy '$dbCopy' has Status:$($copyStatus.Status), ErrorEventId:$($copyStatus.ErrorEventId), `nErrorMessage: $($copyStatus.ErrorMessage), `nSuspendComment: $($copyStatus.SuspendComment)"

    # Log that the DB is not replicated (the lookup is skipped entirely unless verbose output is on)
    if ($VerbosePreference -ne "SilentlyContinue")
    {
        if ( !(Is-DatabaseReplicated ($script:databasesToCheckTable[$dbName])) )
        {
            Log-Verbose "Get-ActiveDatabaseCopyHealth: Database '$dbName' is NOT replicated. It has only 1 copy configured in the AD."
        }
    }

    # First, we need the cluster node status to be Up
    if (!(Is-DagServerOnline $server))
    {
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
        $errMsg = "Active copy '$dbCopy' is not UP according to clustering."
        Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$checkState
    }

    # If replay service is down, we'll just assume that this copy isn't healthy since
    # all the passives will be "stalled", which means we're anyway down to at most 1 copy.
    if ($copyStatus.Status -eq $CopyStatusType::ServiceDown)
    {
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
        $errMsg = "Active copy '$dbCopy' has replay service down. Assuming the copy is unhealthy."
        Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
        return $errMsg,$checkState
    }

    if ( ($copyStatus.Status -eq $CopyStatusType::Dismounted) `
        -or ($copyStatus.Status -eq $CopyStatusType::Dismounting) )
    {
        # There may have been a permanent failure (such as a DB corruption) that is preventing
        # the DB from mounting. If so, there will be an error message recorded.
        if ($copyStatus.ErrorMessage)
        {
            $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
            $errMsg = "Active copy '$dbCopy' is dismounted with an error. Error: $($copyStatus.ErrorMessage)."
            Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
            return $errMsg,$checkState
        }
        else
        {
            # A dismounted copy is fine
            $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Passed
            return $errMsg,$checkState
        }
    }

    if ($copyStatus.Status -eq $CopyStatusType::Mounted)
    {
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Passed
        return $errMsg,$checkState
    }
    elseif ($copyStatus.Status -eq $CopyStatusType::Mounting)
    {
        # NOTE: This is only a warning and doesn't cause RED alert to go off!
        $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Warning
        $errMsg = "Active copy '$dbCopy' is in 'Mounting' state."
        # (this message used to be logged under the passive-copy function's name)
        Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Warning'."
        return $errMsg,$checkState
    }

    # Any other state, assume the worst
    $checkState = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
    $errMsg = "Active copy '$dbCopy' has some unknown/unhealthy state. Status: $($copyStatus.Status)."
    Log-Verbose "Get-ActiveDatabaseCopyHealth: $errMsg Returning 'Failed'."
    return $errMsg,$checkState
}

# Returns the string describing why the copy was not healthy (empty if the copy is healthy),
# followed by the CopyCheckState of the copy.
function Get-PassiveDatabaseCopyHealth ( `
    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry] $copyStatus)
{
    # Evaluates the health of a PASSIVE database copy through a series of checks, returning
    # at the first one that trips. The result is two values: the explanatory message (left
    # empty for Passed) and the CheckHADatabaseRedundancy.CopyCheckState.
    $failedVerdict  = [CheckHADatabaseRedundancy.CopyCheckState]::Failed
    $warningVerdict = [CheckHADatabaseRedundancy.CopyCheckState]::Warning

    $copyName = $copyStatus.Name
    $databaseName = $copyStatus.DatabaseName
    $hostServer = $copyStatus.MailboxServer
    $activeHost = $copyStatus.ActiveDatabaseCopy
    [string]$reason = $null
    [CheckHADatabaseRedundancy.CopyCheckState]$verdict = [CheckHADatabaseRedundancy.CopyCheckState]::Unknown

    Log-Verbose "Get-PassiveDatabaseCopyHealth: Passive copy '$copyName' has Status:$($copyStatus.Status), CopyQueueLength=$($copyStatus.CopyQueueLength), ReplayQueueLength=$($copyStatus.ReplayQueueLength), DatabaseName=$databaseName, ErrorEventId:$($copyStatus.ErrorEventId), `nErrorMessage: $($copyStatus.ErrorMessage), `nSuspendComment: $($copyStatus.SuspendComment)"

    # 1. The hosting cluster node has to be Up
    if (-not (Is-DagServerOnline $hostServer))
    {
        $verdict = $failedVerdict
        $reason = "Passive copy '$copyName' is not UP according to clustering."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Failed'."
        return $reason,$verdict
    }

    # 2. The copy status has to be one of the possibly-healthy ones
    if (-not (Is-PassiveCopyPossiblyHealthy $copyStatus))
    {
        $verdict = $failedVerdict
        $reason = "Passive copy '$copyName' is not in a good state. Status: $($copyStatus.Status)."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Failed'."
        return $reason,$verdict
    }

    # 3. The *real* copy queue (i.e. not including the inspector queue) must be within the threshold
    [int]$copyQueueDepth = [Math]::Max(0, $copyStatus.LastLogGenerated - $copyStatus.LastLogCopied)
    if ($copyQueueDepth -gt $CopyQueueLengthThreshold)
    {
        $verdict = $failedVerdict
        $reason = "Passive copy '$copyName' has actual log copy queue higher than the threshold of '$CopyQueueLengthThreshold'. Copy queue: $copyQueueDepth."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Failed'."
        return $reason,$verdict
    }

    # 4. The inspector queue must not exceed the failure threshold
    [int]$inspectorQueueDepth = [Math]::Max(0, $copyStatus.LastLogCopied - $copyStatus.LastLogInspected)
    if ($inspectorQueueDepth -gt $InspectorQueueLengthFailedThreshold)
    {
        $verdict = $failedVerdict
        $reason = "Passive copy '$copyName' has an inspector queue higher than the failure threshold of '$InspectorQueueLengthFailedThreshold'. Inspector queue: $inspectorQueueDepth."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Failed'."
        return $reason,$verdict
    }

    # 5. Even if the copy queue is small, we can't trust it because the active replay service
    #    might be down, in which case the queues will be stale. (E14# 138911)
    #    So, if the active status is "ServiceDown", but the node is up, we can be fairly
    #    certain that we shouldn't trust the queues.
    if ((Is-ActiveReplayServiceDown $databaseName) -and `
        (Is-DagServerOnline $activeHost))
    {
        $verdict = $failedVerdict
        $reason = "Passive copy '$copyName' has a small copy queue length, but it could be stale. The active replay service on server '$activeHost' appears to be down. Copy queue: $($copyStatus.CopyQueueLength)."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Failed'."
        return $reason,$verdict
    }

    # 6. Inspector queue above the warning threshold
    if ($inspectorQueueDepth -gt $InspectorQueueLengthWarningThreshold)
    {
        # NOTE: This is only a warning and doesn't cause RED alert to go off!
        $verdict = $warningVerdict
        $reason = "Passive copy '$copyName' has an inspector queue higher than the warning threshold of '$InspectorQueueLengthWarningThreshold'. Inspector queue: $inspectorQueueDepth."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Warning'."
        return $reason,$verdict
    }

    # 7. Replay queue above the warning threshold
    [int]$replayQueueDepth = [Math]::Max(0, $copyStatus.LastLogInspected - $copyStatus.LastLogReplayed)
    if ($replayQueueDepth -gt $ReplayQueueLengthWarningThreshold)
    {
        # NOTE: This is only a warning and doesn't cause RED alert to go off!
        $verdict = $warningVerdict
        $reason = "Passive copy '$copyName' has a replay queue higher than the warning threshold of '$ReplayQueueLengthWarningThreshold'. Replay queue: $replayQueueDepth."
        Log-Verbose "Get-PassiveDatabaseCopyHealth: $reason Returning 'Warning'."
        return $reason,$verdict
    }

    # Nothing tripped: the copy passes
    $verdict = [CheckHADatabaseRedundancy.CopyCheckState]::Passed
    return $reason,$verdict
}

# Given a list of databases, find the set of unique servers hosting copies of all of them.
# Returns an array with all the server names.
function Get-ServersForDatabases ( `
    [Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $databases)
{
    # Hashtable used as a set keyed by server name
    $uniqueNames = @{}
    Foreach ($mdb in $databases)
    {
        $mdb.Servers | ForEach-Object {
            if (-not $uniqueNames.Contains($_.Name))
            {
                $uniqueNames.Add($_.Name, 1)
            }
        }
    }

    # Flatten the set's keys into a string array
    [String[]]$nameList = @()
    $uniqueNames.Keys | ForEach-Object { $nameList += $_ }

    Log-Verbose "Get-ServersForDatabases: returning '$($nameList.Length)' servers."
    return $nameList
}

# Runs get-mailboxdatabasecopystatus against copy(ies) on a server and returns an array of status results.
# In the "Database" parameter set only the copy of $MailboxDatabaseName on that server is queried.
# Return type: Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]
function Get-CopyStatusFromServer ([string] $server, [string] $ParameterSetName)
{
    Log-Verbose "Get-CopyStatusFromServer( $server ): Entering..."
    $script:copyStatusStopwatch.Reset()
    $script:copyStatusStopwatch.Start()

    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $copyStatuses = @()
    if ($ParameterSetName -ne "Database")
    {
        $copyStatuses = Get-MailboxDatabaseCopyStatus -Server $server
    }
    else
    {
        $copyStatuses = @( Get-MailboxDatabaseCopyStatus "$MailboxDatabaseName\$server" )
    }

    $script:copyStatusStopwatch.Stop()
    Log-Verbose "Get-CopyStatusFromServer( $server ): operation completed in $($script:copyStatusStopwatch.Elapsed.TotalMilliseconds) ms."
    return $copyStatuses
}

# Synchronously executes get-mdbcs against all the specified servers and returns an
# array of type DatabaseCopyStatusEntry, which holds all the statuses returned.
function Get-CopyStatusFromAllServers ([String[]] $servers, [string] $ParameterSetName)
{
    Log-Verbose "Get-CopyStatusFromAllServers: Entering..."
    $script:copyStatusAllStopwatch.Reset()
    $script:copyStatusAllStopwatch.Start()

    [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $combined = @()
    Foreach ($serverName in $servers)
    {
        # One server at a time; results are appended in the order the servers were given
        [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $perServer = @()
        $perServer = Get-CopyStatusFromServer $serverName $ParameterSetName
        $combined += $perServer
    }

    $script:copyStatusAllStopwatch.Stop()
    Log-Verbose "Get-CopyStatusFromAllServers: operation completed in $($script:copyStatusAllStopwatch.Elapsed.TotalMilliseconds) ms."
    return $combined
}

# Wouldn't it be nice to be able to run get-mdbcs in parallel across all the servers?
# Unfortunately, when I tested this method against just 3 servers in the DAG, it took
# ~26 seconds (25854.3543ms) !!!
# $REVIEW: Is Start-Job supposed to be that slow? I suppose it could be faster if we
# reused the same PSSession. I'll look into "Invoke-Command -AsJob" in future, but for
# now, the overhead is definitely not worth it...
function Get-CopyStatusFromAllServersAsync (
    [String[]] $servers,
    [string] $ParameterSetName = $PSCmdlet.ParameterSetName)
{
    # Parameters:
    #   servers          - names of the servers to query, one background job per server.
    #   ParameterSetName - optional; defaults to the script's parameter set. "Database"
    #                      queries only "$MailboxDatabaseName\<server>", anything else
    #                      queries every copy on the server.
    # Output: whatever Receive-Job returns for each job (the copy status entries).

    # We'll run Get-CopyStatus in parallel across all the servers so that we don't
    # excessively slow down the status retrieval in case some servers are down.
    [System.Management.Automation.PSRemotingJob[]] $asyncJobs = @()

    # The job runs in a separate process, so nothing from this script's scope
    # ($PSCmdlet, $MailboxDatabaseName) is visible there. Everything the job needs
    # is therefore passed explicitly through -ArgumentList.
    $getStatusCmd = {
        param([string] $jobParameterSetName, [string] $jobDatabaseName)
        Process
        {
            # NOTE(review): relies on Start-Job -InputObject feeding the server name
            # into this Process block as $_ - confirm against the Start-Job docs.
            $tmpServer = $_;

            if (! (Get-PSSnapin Microsoft.Exchange.Management.PowerShell.E2010 -ErrorAction:SilentlyContinue) )
            {
                Add-PSSnapin Microsoft.Exchange.Management.PowerShell.E2010
            }

            [Microsoft.Exchange.Management.SystemConfigurationTasks.DatabaseCopyStatusEntry[]] $statuses = @()
            if ($jobParameterSetName -eq "Database" )
            {
                $statuses = @( Get-MailboxDatabaseCopyStatus "$jobDatabaseName\$tmpServer" )
            }
            else
            {
                $statuses = @( Get-MailboxDatabaseCopyStatus -Server $tmpServer )
            }
            return $statuses
        }
    }

    $sw = New-Object -TypeName System.Diagnostics.Stopwatch
    $sw.Reset();
    $sw.Start();

    ######
    ## <Timed portion>
    Foreach ($server in $servers)
    {
        $asyncJobs += Start-Job -ScriptBlock $getStatusCmd -InputObject $server `
            -ArgumentList $ParameterSetName, $MailboxDatabaseName
    }
    Log-Verbose "Get-CopyStatusFromAllServersAsync: Started $($asyncJobs.Length) async jobs."

    if ($asyncJobs.Length -eq 0)
    {
        # Nothing was started; Wait-Job cannot be called with an empty job list.
        $sw.Stop();
        return
    }

    # wait on all of them to complete; the job objects Wait-Job emits are
    # discarded so they do not get mixed into this function's results
    Wait-Job $asyncJobs | Out-Null
    $sw.Stop();
    ## </Timed portion>
    ######

    Log-Verbose "Get-CopyStatusFromAllServersAsync: Async operations completed in $($sw.Elapsed.TotalMilliseconds) ms."

    Foreach ($job in $asyncJobs)
    {
        $results = Receive-Job $job
        Log-Verbose "`$results = $results"
        $results

        # The results have been collected, so release the finished job.
        Remove-Job $job
    }
}

#######################################################################
# Dynamic code compiler logic
#######################################################################

# Builds the list of assembly reference paths for compilation.
#
# Parameters:
#   References - optional extra assemblies. An entry without a "\" is treated as a
#                file name and prefixed with "${framework}\".
# Output: the five default framework assemblies followed by the extra references.
#
# NOTE(review): $framework is not defined in this function; it is presumably a
# script-level variable holding the .NET framework directory - confirm it exists.
function ConstructReferences([Array]$References)
{
    #
    # Build up a compiler params object...
    # An ArrayList is used because a PowerShell array (@()) is fixed-size and
    # has no AddRange method.
    $refs = New-Object System.Collections.ArrayList
    $refs.AddRange( @("${framework}\System.dll",
                      "${framework}\system.windows.forms.dll",
                      "${framework}\System.data.dll",
                      "${framework}\System.Drawing.dll",
                      "${framework}\System.Xml.dll"))

    # $null goes on the left so that the comparison is a null check and not an
    # element filter over the array.
    if (($null -ne $References) -and ($References.Count -ge 1))
    {
        foreach ($refAssembly in $References)
        {
            [string] $refTmp = $refAssembly
            if ($refTmp.IndexOf("\") -eq -1)
            {
                $refTmp = "${framework}\$refTmp"
            }
            # ArrayList.Add returns the new index; discard it so it is not emitted.
            [void]$refs.Add($refTmp);
        }
    }

    return $refs
}

# Compile the types to be used for tracking the Database Redundancy state.
# The compilation is only performed once per runspace and is entirely in memory.
#
# Defines (in namespace CheckHADatabaseRedundancy) the AlertState and CopyCheckState
# enums, the DatabaseRedundancyEntry class and the EventLogger helper.
# NOTE: the C# source lives in a single-quoted PowerShell string, so it must not
# contain any apostrophes.
function Prepare-DatabaseRedundancyEntryDefinition
{
    $code = '
using System;
using System.Collections.Generic;
using System.Diagnostics;

namespace CheckHADatabaseRedundancy
{
    public enum AlertState : int
    {
        Unknown = 0,
        Green,
        Red
    }

    // Enum describing the state of an individual database copy. At the moment,
    // both Passed and Warning are treated as having passed the redundancy check
    // and hence CurrentRedundancyCount is incremented.
    public enum CopyCheckState : int
    {
        Unknown = 0,
        Passed,
        Warning,
        Failed
    }

    public class DatabaseRedundancyEntry
    {
        public class ErrorRecord
        {
            public DateTime ErrorTime { get; set; }
            public string[] ErrorMessages { get; set; }

            public string GetErrorStringForAlerting()
            {
                if (ErrorMessages == null || ErrorMessages.Length == 0)
                {
                    return String.Empty;
                }
                return String.Join("\n", ErrorMessages);
            }
        }

        public string DatabaseName { get; set; }
        public int LastRedundancyCount { get; set; }
        public int CurrentRedundancyCount { get; set; }
        public AlertState LastState { get; set; }
        public AlertState CurrentState { get; set; }
        public DateTime? LastStateTransitionUtc { get; set; }
        public DateTime? LastGreenTransitionUtc { get; set; }
        public DateTime? LastRedTransitionUtc { get; set; }
        public DateTime? LastGreenReportedUtc { get; set; }
        public DateTime? LastRedReportedUtc { get; set; }

        // the previous total red duration (not counting the current stretch of reds)
        public TimeSpan PreviousTotalRedDuration { get; set; }

        public TimeSpan TotalRedDuration
        {
            get
            {
                // Only add the current stretch when its start time is known;
                // reading .Value on an empty nullable would throw.
                if (this.CurrentState == AlertState.Red && this.LastRedTransitionUtc.HasValue)
                {
                    // count the current duration of reds
                    return this.PreviousTotalRedDuration + (DateTime.UtcNow - this.LastRedTransitionUtc.Value);
                }
                else
                {
                    return this.PreviousTotalRedDuration;
                }
            }
        }

        public bool IsTransitioningState
        {
            get { return LastState != CurrentState; }
        }

        public bool HasErrorsInHistory
        {
            get
            {
                if (this.ErrorHistory == null || this.ErrorHistory.Count == 0)
                {
                    return false;
                }
                return true;
            }
        }

        public string[] CurrentErrorMessages { get; set; }
        public List<ErrorRecord> ErrorHistory { get; private set; }

        public string GetErrorStringForAlerting()
        {
            if (CurrentErrorMessages == null || CurrentErrorMessages.Length == 0)
            {
                return String.Empty;
            }
            return String.Join("\n", CurrentErrorMessages);
        }

        // Create a copy of the errorMessages array and then add the record to the history
        public void AddErrorRecordToHistory(DateTime errorTime, string[] errorMessages)
        {
            // A null array is recorded as an entry without messages.
            string[] source = errorMessages ?? new string[0];
            string[] tmpMessages = new string[source.Length];
            source.CopyTo(tmpMessages, 0);

            ErrorRecord er = new ErrorRecord();
            er.ErrorTime = errorTime;
            er.ErrorMessages = tmpMessages;

            if (this.ErrorHistory == null)
            {
                this.ErrorHistory = new List<ErrorRecord>(15);
            }
            this.ErrorHistory.Add(er);
        }
    }

    public static class EventLogger
    {
        public static void WriteLocalizedEvent(
            string logName,             // eg: Application
            string sourceName,          // eg: MSExchangeRepl
            long eventId,               // Message resource ID: eg: (long)0xC0041011
            int categoryId,             // category of the event
            EventLogEntryType entryType, // error, information, warning
            byte[] data,
            params object[] messageArgs)
        {
            // EventLog is disposable; release it once the event has been written.
            using (EventLog eventLog = new EventLog(logName, Environment.MachineName, sourceName))
            {
                EventInstance instance = new EventInstance(
                    eventId,
                    categoryId,
                    entryType);
                eventLog.WriteEvent(instance, data, messageArgs);
            }
        }
    }
}'

    $checkCompiledCmd = {
        # Check if the type is loaded. If not, a RuntimeException is thrown.
        [CheckHADatabaseRedundancy.AlertState];
    }

    [bool]$isCompiled = TryExecute-ScriptBlock -runCommand:$checkCompiledCmd -silentOnErrors:$true
    if (!$isCompiled)
    {
        ##################################################################
        # So now we compile the code and use .NET object access to run it.
        ##################################################################
        Log-Verbose "Compiling code..."
        Add-Type -TypeDefinition $code -Language "CSharpVersion3"
        Log-Verbose "Done!"
    }
}

# Writes an Information event to the Application log under the MSExchangeRepl source.
#   eventId     - message resource ID as a hex string.
#   categoryId  - category of the event.
#   messageArgs - insertion values for the event message.
function Write-HAAppLogInformationEvent(
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Object[]] $messageArgs)
{
    Write-LocalizedEventLog "Application" "MSExchangeRepl" $eventId $categoryId `
        "Information" $null $messageArgs
}

# Writes a Warning event to the Application log under the MSExchangeRepl source.
# Parameters are the same as for Write-HAAppLogInformationEvent.
function Write-HAAppLogWarningEvent(
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Object[]] $messageArgs)
{
    Write-LocalizedEventLog "Application" "MSExchangeRepl" $eventId $categoryId `
        "Warning" $null $messageArgs
}

# Writes an Error event to the Application log under the MSExchangeRepl source.
# Parameters are the same as for Write-HAAppLogInformationEvent.
function Write-HAAppLogErrorEvent(
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Object[]] $messageArgs)
{
    Write-LocalizedEventLog "Application" "MSExchangeRepl" $eventId $categoryId `
        "Error" $null $messageArgs
}

# Parses the hex event ID and forwards everything to the compiled
# [CheckHADatabaseRedundancy.EventLogger]::WriteLocalizedEvent helper.
# The types must already have been compiled by Prepare-DatabaseRedundancyEntryDefinition.
function Write-LocalizedEventLog(
    [Parameter(Mandatory=$true)] [string] $logName,
    [Parameter(Mandatory=$true)] [string] $sourceName,
    [Parameter(Mandatory=$true)] [string] $eventId, # eg: "C0041011"
    [Parameter(Mandatory=$true)] [int] $categoryId,
    [Parameter(Mandatory=$true)] [System.Diagnostics.EventLogEntryType] $entryType,
    [Byte[]] $data,
    [Object[]] $messageArgs)
{
    # parse the eventId into an Int64 first.
    [Int64]$id = [Int64]::Parse($eventId, [System.Globalization.NumberStyles]::HexNumber)

    [CheckHADatabaseRedundancy.EventLogger]::WriteLocalizedEvent( `
        $logName, $sourceName, $id , $categoryId, $entryType, $data, $messageArgs)
}

# Common function to run a scriptblock, log any error that occurred, and return
# a boolean to indicate whether it was successful or not.
# NOTE: ErrorActionPreference of "Stop" is used to catch all errors.
#
# Optional parameters:
#
# cleanupCommand
#   This scriptblock will be executed with ErrorActionPreference of "Continue",
#   if an error occurred while running $runCommand.
#
# throwOnError
#   If true, the error from $runCommand will be rethrown. Otherwise 'false' is returned on error.
#
# silentOnErrors
#   If true, the error from $runCommand will not be logged via Log-ErrorRecord (i.e. Write-Error)
function TryExecute-ScriptBlock ([ScriptBlock]$runCommand, [ScriptBlock]$cleanupCommand={}, [bool]$throwOnError=$false, [bool]$silentOnErrors=$false)
{
    # Run the following in a separate script block so that we can change
    # ErrorActionPreference without affecting the rest of the script.
    &{
        $ErrorActionPreference = "Stop"
        [bool]$success = $false;

        try
        {
            # Output of the command is captured and dropped; only success/failure is reported.
            $ignoredObjects = @(&$runCommand)
            $success = $true;
        }
        catch
        {
            # Any error will end up in this catch block
            # For some reason, PS does not write out any errors unless I use this
            # scriptblock with "Continue" ErrorActionPreference.
            &{
                $ErrorActionPreference = "Continue"
                if (!$silentOnErrors)
                {
                    Log-ErrorRecord $_
                }

                # Run the cleanup scriptblock
                $ignoredObjects = @(&$cleanupCommand)
            }

            if ($throwOnError)
            {
                throw
            }
        }
        finally
        {
            # Curious PS behavior: It appears that 'return' trumps 'throw', so don't return...
            if (!$throwOnError -or $success)
            {
                return $success
            }
        }
    }
}

# Sleep for the specified duration (in seconds)
function Sleep-ForSeconds ( [int]$sleepSecs )
{
    Log-Verbose "Sleeping for $sleepSecs seconds..."
    Start-Sleep $sleepSecs
}

# Common function to retrieve the current UTC time string
function Get-CurrentTimeString
{
    return [DateTime]::UtcNow.ToString("[HH:mm:ss.fff UTC]")
}

# Common function for verbose logging
function Log-Verbose ( [string]$msg )
{
    $timeStamp = Get-CurrentTimeString
    Write-Verbose "$timeStamp $msg"
}

# Common function for warning logging
function Log-Warning ( [string]$msg )
{
    $timeStamp = Get-CurrentTimeString
    Write-Warning "$timeStamp $msg"
}

# Common function for error logging
# With -Stop the error is written with -ErrorAction:Stop.
function Log-Error ( [string]$msg, [switch]$Stop)
{
    $timeStamp = Get-CurrentTimeString
    if (!$Stop)
    {
        Write-Error "$timeStamp $msg"
    }
    else
    {
        Write-Error "$timeStamp $msg" -ErrorAction:Stop
    }
}

# Common function for logging an error, given an ErrorRecord
#   errRecord - the error to log; its message is flattened to a single line.
#   Stop      - passed through to Log-Error.
function Log-ErrorRecord( [System.Management.Automation.ErrorRecord] $errRecord, [switch]$Stop )
{
    # Trim the message so it will not display the "ErrorActionPreference is set to Stop" message
    #
    $failedMessage = $errRecord.ToString()
    if ($failedMessage.IndexOf("ErrorActionPreference") -ne -1)
    {
        # Only trim when the marker is really there. Without this check a missing
        # marker (IndexOf returns -1) would cut off the first 12 characters of the
        # message, or fail on a shorter message.
        $stopMarker = "set to Stop: "
        $stopMarkerIndex = $failedMessage.IndexOf($stopMarker)
        if ($stopMarkerIndex -ne -1)
        {
            $failedMessage = $failedMessage.Substring($stopMarkerIndex + $stopMarker.Length)
        }
    }

    $failedMessage = $failedMessage -replace "`r"
    $failedMessage = $failedMessage -replace "`n"

    # Not every error record carries invocation details.
    $failedCommand = $null
    if ($errRecord.InvocationInfo)
    {
        $failedCommand = $errRecord.InvocationInfo.MyCommand
    }

    Log-Error "Failed at command '$failedCommand' with '$failedMessage'" -Stop:$Stop
}

# Shuffles objects coming from the input pipeline.
# NOTE: This method only works when invoked via a pipeline.
function Shuffle-Objects(
    [Parameter(Mandatory=$true,ValueFromPipeline=$true)]
    $inputData )
{
    Begin
    {
        # everything received from the pipeline is buffered here
        $inputDataList = @()
    }
    Process
    {
        # build the input list first
        $inputDataList += $inputData
    }
    End
    {
        # Walk the list front to back and swap every slot with a randomly
        # chosen slot at or after it.
        $count = $inputDataList.Length
        [int] $slot = 0
        while ($slot -lt $count)
        {
            # Get-Random excludes -Maximum, so the pick lies in [$slot, $count - 1]
            [int]$pick = Get-Random -Minimum:$slot -Maximum:$count

            $held = $inputDataList[$slot]
            $inputDataList[$slot] = $inputDataList[$pick]
            $inputDataList[$pick] = $held

            $slot++
        }

        # send each element to the output pipeline
        foreach ($item in $inputDataList)
        {
            $item
        }
    }
}

# This will send a mail message to the specified recipients.
#
# Based on Send-Mail function from DatacenterHealthCommonLibrary.ps1 (service engineering scripts).
#
# Here we will create two SMTP clients: (1) datacenter client is configured to send mails from
# production datacenter environment (based on Send-Mail function from DatacenterHealthCommonLibrary.ps1),
# (2) CorpNet client is configured to send mails from Topobuilder machines inside a CORPNET.
#
# When $script:IsDataCenterLibraryPresent is set, the library's send-mail function is called
# with strict mode switched off; otherwise the mail goes through Send-HANotificationMailCorpHub
# and a failure is reported with Log-Error.
function Send-HANotificationMail(
    [string]$title,
    [string]$body,
    [string[]]$attachments,
    [string]$from,
    [string[]]$tos,
    [string[]]$ccs,
    [string]$pri = "Normal",
    [int]$maxRetryAttempts = 2)
{
    if (-not $script:IsDataCenterLibraryPresent)
    {
        # No datacenter library: deliver through a hub transport server.
        Log-Verbose "Calling Send-HANotificationMailCorpHub..."
        [bool]$delivered = Send-HANotificationMailCorpHub `
            -title:$title `
            -body:$body `
            -attachments:$attachments `
            -from:$from `
            -tos:$tos `
            -ccs:$ccs `
            -pri:$pri `
            -maxRetryAttempts:$maxRetryAttempts
        if (-not $delivered)
        {
            Log-Error "Send mail failed!"
        }
        return
    }

    Log-Verbose "Calling DataCenter send-mail function..."
    Set-StrictMode -Off
    send-mail -title:$title -body:$body -from:$from -tos:$tos -ccs:$ccs -attachments:$attachments -pri:$pri
    Set-StrictMode -Version 2.0
}

# Returns the Exchange servers whose IsHubTransportServer property is set.
function Get-HubServers
{
    Get-ExchangeServer | Where-Object { $_.IsHubTransportServer }
}

# Build a list of SMTP clients that can send mail to a local hub server.
#
function Build-HubSmtpClients
{
    #FUTURE: We should return a list of hostnames and port and let the caller iterate over them if failure...
    # Also, try to choose a server in the same site as we are running
    # In production we will use the "send-mail" function provided by the svc engineering team

    # Randomise the order of the hub servers before building the clients.
    $shuffledHubs = (Get-HubServers | Shuffle-Objects )
    if (-not $shuffledHubs)
    {
        Log-Error "No Hub Server found!"
        return
    }

    # Emit one SMTP client per hub server, using the default credentials.
    @($shuffledHubs) | ForEach-Object {
        $client = New-Object System.Net.Mail.SmtpClient($_.Fqdn)
        $client.UseDefaultCredentials = $true
        $client
    }
}

# Returns the SMTP clients produced by Build-HubSmtpClients.
function Get-SmtpClients ()
{
    Log-Verbose "Entering Get-SmtpClients:"

    $smtpClients = @()
    $smtpClients += Build-HubSmtpClients
    return $smtpClients
}

# Build a Mail message
#
# Creates a System.Net.Mail.MailMessage from the given subject ($title), body,
# sender, recipients and priority. Attachment paths that fail Test-Path are skipped.
function Build-MailMsg(
    [string]$title,
    [string]$body,
    [string[]]$attachments,
    [string]$from,
    [string[]]$tos,
    [string[]]$ccs,
    [string]$pri = "Normal")
{
    $message = New-Object System.Net.Mail.MailMessage
    $message.Body = $body
    $message.Priority = $pri
    $message.Subject = $title
    $message.From = New-Object System.Net.Mail.MailAddress($from);

    # Add attachments
    if ($attachments)
    {
        @($attachments) | ForEach-Object {
            if ( Test-Path $_ )
            {
                $file = New-Object System.Net.Mail.Attachment -ArgumentList $_, 'Application/Octet'
                [void]$message.Attachments.Add($file);
            }
        }
    }

    # Recipients
    @($tos) | ForEach-Object { [void]$message.To.Add($_) }

    # Carbon copies are optional
    if ($ccs)
    {
        @($ccs) | ForEach-Object { [void]$message.CC.Add($_) }
    }

    return $message
}

# Send an email message
# Return $true if an SMTP host was contacted and the mail transmitted.
# There is no guarantee that the mail will get through.
# This is cloned from Send-NotificationMail in DatacenterSvcEngCommonLibrary.ps1, but made simpler
# so I could add function and simplify it at the same time.
#
# Parameters:
#   title, body, attachments, from, tos, ccs, pri - passed to Build-MailMsg.
#   maxRetryAttempts - number of send attempts made per SMTP client.
# Each client from Get-SmtpClients is tried in turn until one send succeeds.
function Send-HANotificationMailCorpHub(
    [string]$title,
    [string]$body,
    [string[]]$attachments,
    [string]$from,
    [string[]]$tos,
    [string[]]$ccs,
    [string]$pri = "Normal",
    [int]$maxRetryAttempts = 2)
{
    Log-Verbose "Entering Send-HANotificationMailCorpHub: `$from=$from, `$pri=$pri"

    $clients = @(Get-SmtpClients)
    if (!$clients)
    {
        Log-Error "Get-SmtpClients failed!"
        return $false
    }

    $mailMessage = Build-MailMsg -title $title -body $body -attachments $attachments `
        -from $from -tos $tos -ccs $ccs -pri $pri
    if (!$mailMessage)
    {
        Log-Error "Build-MailMsg failed!"
        return $false
    }

    # Defined up front so the final 'return' always has a value to report.
    $success = $false

    try
    {
        foreach ($smtpClient in $clients)
        {
            # Change the timeout for synchronous Send() call to 30 secs
            $smtpClient.Timeout = 30000
            $retries = 0

            Log-Verbose "Sending notification mail to: $([string]::Join(';',$tos))"
            Log-Verbose "Using SMTP client for '$($smtpClient.Host)', port=$($smtpClient.Port)"
            Log-Verbose "Sending mail from '$from'..."

            do
            {
                try
                {
                    $success = $true
                    $smtpClient.Send($mailMessage)
                    Log-Verbose "Mail sent!"
                    return $true
                }
                catch
                {
                    $success = $false
                    $retries++

                    # Record why the attempt failed instead of dropping the error.
                    Log-Verbose "Send attempt $retries failed: $($_.Exception.Message)"

                    if ($retries -eq $maxRetryAttempts)
                    {
                        Log-Verbose "Exceeded $maxRetryAttempts retries sending mail to $tos."
                    }
                    else
                    {
                        Log-Verbose "Retrying to send mail to $tos."
                    }
                }
            } while ((-not $success) -and $retries -lt $maxRetryAttempts)

            Log-Verbose "Send failed. Trying to use a different smtp client if possible..."
        }
    }
    finally
    {
        # Runs on every exit path, including the 'return $true' above.
        $mailMessage.Dispose()
    }

    return $success
}

# Appends the summary of one database state entry, followed by its recorded
# errors, to $script:report. Accepts the entries from the pipeline.
#
# NOTE(review): the string literals below look like multi-line strings whose line
# breaks were lost; they are kept exactly as found - confirm against the original.
function Append-RedundancyInformation(
    [Parameter(Mandatory=$true,ValueFromPipeline=$true)]
    [CheckHADatabaseRedundancy.DatabaseRedundancyEntry] $dbState)
{
    Process
    {
        $totalMins = $dbState.TotalRedDuration.TotalMinutes.ToString("F2")
        $msg = " Database : $($dbState.DatabaseName) Redundancy Count : $($dbState.CurrentRedundancyCount) Total Red Minutes : $totalMins"
        $script:report.AppendLine($msg) | Out-Null

        # ErrorHistory stays null until the first error is recorded. Wrapping a
        # null in @() would give one null element and a method call on $null,
        # so the history is only walked when it has entries.
        if ($dbState.HasErrorsInHistory)
        {
            foreach ($errRecord in $dbState.ErrorHistory)
            {
                $timeStr = $errRecord.ErrorTime.ToString("HH:mm:ss.fff UTC")
                $msg = " $timeStr : $($errRecord.GetErrorStringForAlerting()) "
                $script:report.AppendLine($msg) | Out-Null
            }
        }
    }
}

# Builds the redundancy report from $script:databaseStateTable and mails it to
# $SendSummaryMailTos. Nothing is sent when no recipients are configured or when
# no database is red or has errors in its history.
#
# NOTE(review): the report string literals below look like multi-line strings whose
# line breaks were lost; they are kept exactly as found - confirm against the original.
function Send-SummaryEmail
{
    if (!$SendSummaryMailTos)
    {
        return
    }

    [System.Text.StringBuilder]$script:report = New-Object -TypeName System.Text.StringBuilder -ArgumentList 2048

    $states = $script:databaseStateTable.Values

    # Databases that are red now and have been red for longer than the reporting threshold
    $dbsWithOneCopy = $states | where {
            ($_.CurrentState -eq [CheckHADatabaseRedundancy.AlertState]::Red) -and
            ($_.TotalRedDuration.TotalSeconds -gt $ReportRedEventAfterDurationSecs)
        } | sort -Property DatabaseName

    # Databases with recorded errors that do not fall into the group above
    $dbsWithErrors = $states | where {
            $_.HasErrorsInHistory -and
            ( ($_.CurrentState -ne [CheckHADatabaseRedundancy.AlertState]::Red) -or
              ($_.TotalRedDuration.TotalSeconds -le $ReportRedEventAfterDurationSecs)
            )
        } | sort -Property DatabaseName

    [int]$databasesCount = ($states | Measure-Object).Count
    [int]$dbsWithErrorsCount = ($dbsWithErrors | Measure-Object).Count
    [int]$dbsOneCopyCount = ($dbsWithOneCopy | Measure-Object).Count

    if ( ($dbsWithErrorsCount -eq 0) -and ($dbsOneCopyCount -eq 0) )
    {
        # No need to send an email to report that everything is healthy
        Log-Verbose "Skipping sending an email report since everything is healthy."
        return
    }

    [string]$priority = "Normal"
    $dbsWithOneCopyNames = ( $dbsWithOneCopy | select -ExpandProperty DatabaseName )
    $dbsWithErrorsNames = ( $dbsWithErrors | select -ExpandProperty DatabaseName )
    [string]$dbsWithOneCopyNamesStr = $null
    [string]$dbsWithErrorsNamesStr = $null
    if ($dbsWithOneCopyNames)
    {
        $dbsWithOneCopyNamesStr = [string]::Join(", ", $dbsWithOneCopyNames)
    }
    if ($dbsWithErrorsNames)
    {
        $dbsWithErrorsNamesStr = [string]::Join(", ", $dbsWithErrorsNames)
    }

    # Report header
    [string] $msg = " *************************************** Database Redundancy Report $((Get-Date).DateTime) ***************************************"
    $script:report.AppendLine($msg) | Out-Null

    if ($PSCmdlet.ParameterSetName -eq "Database" )
    {
        $msg = "Database : $MailboxDatabaseName"
        $script:report.AppendLine($msg) | Out-Null
    }
    elseif ($PSCmdlet.ParameterSetName -eq "Server" )
    {
        $msg = "Server : $MailboxServerName"
        $script:report.AppendLine($msg) | Out-Null
    }

    # Totals
    $msg = "DatabaseCount : $databasesCount DbsCountWithLowRedundancy : $dbsOneCopyCount DbsCountWithErrors : $dbsWithErrorsCount DatabasesWithLowRedundancy : $dbsWithOneCopyNamesStr DatabasesWithErrors : $dbsWithErrorsNamesStr"
    $script:report.AppendLine($msg) | Out-Null

    if ($dbsOneCopyCount -gt 0)
    {
        # mark email as urgent
        $priority = "High"

        $msg = " ================================================================= Databases with low redundancy ( < $AtLeastNCopies copies) ================================================================="
        $script:report.AppendLine($msg) | Out-Null
        @($dbsWithOneCopy) | Append-RedundancyInformation
    }

    if ($dbsWithErrorsCount -gt 0)
    {
        $msg = " ================================================================= Databases with errors ( >= $AtLeastNCopies copies) ================================================================="
        $script:report.AppendLine($msg) | Out-Null
        @($dbsWithErrors) | Append-RedundancyInformation
    }

    # Create the email subject
    [string]$title = "DB Redundancy: "
    if ($script:dagName)
    {
        $title += "$($script:dagName): "
    }

    if ($PSCmdlet.ParameterSetName -eq "Database" )
    {
        $title += "$MailboxDatabaseName - "
    }
    elseif ($PSCmdlet.ParameterSetName -eq "Server" )
    {
        $title += "$MailboxServerName - "
    }

    if ($dbsOneCopyCount -gt 0)
    {
        $durationMins = [TimeSpan]::FromSeconds($ReportRedEventAfterDurationSecs).TotalMinutes
        if ($dbsOneCopyCount -eq 1)
        {
            $title += "1 DB has less than $AtLeastNCopies copies for more than $durationMins mins"
        }
        else
        {
            $title += "$dbsOneCopyCount DBs have less than $AtLeastNCopies copies for more than $durationMins mins"
        }
    }
    elseif ($dbsWithErrorsCount -gt 0)
    {
        if ($dbsWithErrorsCount -eq 1)
        {
            $title += "1 DB has had errors in the past hour"
        }
        else
        {
            $title += "$dbsWithErrorsCount DBs have had errors in the past hour"
        }
    }
    else
    {
        # Not reached at present: the function returns early above when both
        # counts are zero. Kept as a safe default.
        $title += "All DBs have been sufficiently redundant for the past hour"
    }

    # send the email
    Send-HANotificationMail -title:$title -body:($script:report.ToString()) -from:$SummaryMailFrom -tos:$SendSummaryMailTos -pri:$priority
}

###################################################################
### Entry point for the script itself
###################################################################

# Runs one iteration of the check through TryExecute-ScriptBlock, logs whether
# it succeeded, and emits whatever RunOnceInternal produced.
function RunOnce
{
    $script:outputObjects = $null

    # Run each iteration of the check in a separate script block so that any errors
    # can be trapped and the entire script block exits.
    $checkCmd = { $script:outputObjects = RunOnceInternal }
    [bool]$success = TryExecute-ScriptBlock -runCommand $checkCmd

    if ($success)
    {
        Log-Verbose "Iteration $($script:iteration) of the monitoring check completed successfully."
    }
    else
    {
        Log-Error "Iteration $($script:iteration) of the monitoring check FAILED due to an error."
    }

    # send to the output pipeline
    $script:outputObjects
}

# Looks up the databases selected by the script parameters (a single database, or
# all non-recovery databases on a server) and passes them to Check-Databases.
# The iteration is timed with $script:oneIterationStopwatch.
function RunOnceInternal
{
    $script:oneIterationStopwatch.Reset()
    $script:oneIterationStopwatch.Start()

    $script:iteration++
    Log-Verbose "Starting iteration $($script:iteration) of the monitoring check..."

    # The databases being monitored
    [Microsoft.Exchange.Data.Directory.SystemConfiguration.MailboxDatabase[]] $mdbs = @()

    # Lookup the specified database
    if ($PSCmdlet.ParameterSetName -eq "Database" )
    {
        $mdb = Get-MailboxDatabase $MailboxDatabaseName
        if ($mdb)
        {
            if ($mdb.Recovery)
            {
                Log-Error "Database '$mdb' is a recovery database. Please specify a non-recovery database to check the health of." -Stop
                return
            }
            # We will check databases even if they only have 1 configured copy
            $mdbs += $mdb
        }
        else
        {
            Log-Verbose "Could not find database matching '$MailboxDatabaseName'."
        }
    }
    # Lookup all databases on the specified server
    elseif ($PSCmdlet.ParameterSetName -eq "Server" )
    {
        $server = Get-MailboxServer $MailboxServerName
        if ($server)
        {
            Log-Verbose "Found mailbox server '$MailboxServerName'."
            $script:mailboxServer = $server
            $allMdbs = @( Get-MailboxDatabase -Server $server )

            if ($SkipDatabasesRegex)
            {
                # filter out the DBs matching the regex specified
                Log-Verbose "Filtering out databases matching the following regex: '$SkipDatabasesRegex'"
                $mdbs = @( $allMdbs | where { ($_.Name -inotmatch $SkipDatabasesRegex) -and (!$_.Recovery) } )
            }
            else
            {
                # no database name filter specified, so check against all
                $mdbs = @( $allMdbs | where { !$_.Recovery } )
            }
        }
        else
        {
            Log-Verbose "Could not find server matching '$MailboxServerName'."
        }
    }

    Log-Verbose "Found $($mdbs.Length) databases..."

    # perform the check
    if ($mdbs.Length -gt 0)
    {
        Check-Databases $mdbs $PSCmdlet.ParameterSetName
    }
    else
    {
        Log-Verbose "Skipping Check-Databases since there are no databases to check!"
    }

    $script:oneIterationStopwatch.Stop()
    Log-Verbose "Iteration $($script:iteration) of the monitoring check completed in $($script:oneIterationStopwatch.Elapsed.TotalMilliseconds) ms"
}

# This function returns true if you can remove the copy without losing redundancy and false otherwise
#
# Parameters:
#   databaseName - database whose copy would be removed (required, non-empty).
#   serverName   - server hosting that copy (required, non-empty).
# Throws when a parameter is missing or empty, or when the copy is not found.
function Check-DatabaseRedundancyForCopyRemoval(
    [string] $databaseName = $(throw "Check-DatabaseRedundancyForCopyRemoval: databaseName is required."),
    [string] $serverName = $(throw "Check-DatabaseRedundancyForCopyRemoval: serverName is required."))
{
    if ( -not $databaseName )
    {
        throw "Check-DatabaseRedundancyForCopyRemoval: databaseName cannot be empty."
    }
    if ( -not $serverName )
    {
        throw "Check-DatabaseRedundancyForCopyRemoval: serverName cannot be empty."
    }

    # These assignments create function-local variables; the functions called below
    # look them up through the caller's scope.
    $MailboxDatabaseName = $databaseName
    $SkipEventLogging = $true
    $MonitoringContext = $false

    $databases = @( Get-MailboxDatabase $MailboxDatabaseName )
    Populate-DatabasesTable $databases

    # find the servers to check copy statuses on
    [String[]]$servers = Get-ServersForDatabases $databases
    if ($servers.Length -lt 2)
    {
        # Normally we should not get here, since we're only checking replicated DBs, which means
        # we should have at least 2 distinct servers. However, this can happen if copies are
        # removed while this script is running.
        Log-Warning "Check-DatabaseRedundancyForCopyRemoval: Get-ServersForDatabases returned only '$($servers.Length)' servers."
    }

    # get the status results and index them by database name
    $script:databaseToStatusTable.Clear()
    $script:databaseToStatusTable = Get-CopyStatusFromAllServers $servers "Database" |
        Group-Object -AsHashTable -Property DatabaseName
    if ($null -eq $script:databaseToStatusTable)
    {
        # Group-Object produces nothing when no status was returned; keep the
        # table usable for the lookups below and for the next Clear() call.
        $script:databaseToStatusTable = @{}
    }

    # look up the cluster node status for the DAG
    Populate-ClusterNodeStatus

    # Simulate copy removal
    [UInt32] $regularAtLeastNCopies = $AtLeastNCopies
    [bool] $foundCopy = $false

    # Only index into the table when it has an entry for this database.
    [int] $copyCount = 0
    if ($script:databaseToStatusTable.ContainsKey($MailboxDatabaseName))
    {
        $copyCount = $script:databaseToStatusTable[$MailboxDatabaseName].Count
    }

    for ($i = 0; $i -lt $copyCount; $i++)
    {
        if ( $script:databaseToStatusTable[$MailboxDatabaseName][$i].Name -eq "$MailboxDatabaseName\$serverName" )
        {
            if ( $script:databaseToStatusTable[$MailboxDatabaseName][$i].ActiveCopy )
            {
                # The active copy stays in the list; one more copy is required instead.
                $AtLeastNCopies = $regularAtLeastNCopies + 1
                Log-Verbose "Check-DatabaseRedundancyForCopyRemoval: Active copy $MailboxDatabaseName\$serverName will be removed. Redundancy count should be at least $AtLeastNCopies for active copy removal."
            }
            else
            {
                Log-Verbose "Check-DatabaseRedundancyForCopyRemoval: Passive copy $MailboxDatabaseName\$serverName will be removed."
                $script:databaseToStatusTable[$MailboxDatabaseName].RemoveAt($i)
            }
            $foundCopy = $true
            break
        }
    }

    if ( -not $foundCopy )
    {
        throw "Check-DatabaseRedundancyForCopyRemoval: Copy $MailboxDatabaseName\$serverName was not found."
    }

    $status = Check-DatabaseRedundancy $MailboxDatabaseName
    [bool] $result = $status.CurrentState -eq "Green"

    $AtLeastNCopies = $regularAtLeastNCopies

    return $result
}

# Script driver: runs the check once, or repeatedly when -MonitoringContext is given,
# and sends the summary email at the end.
function Main
{
    if ($PSCmdlet.ParameterSetName -eq "DotSourceMode")
    {
        Log-Verbose "Script run with -DotSourceMode. Exiting."
        return
    }

    # Ensure this table is cleared at script startup
    # Other hashtables get cleared every iteration of RunOnce.
    $script:databaseStateTable.Clear()

    # Validate the email parameters
    if ($SendSummaryMailTos -and !$SummaryMailFrom)
    {
        Log-Error "Please specify a -SummaryMailFrom address as well when -SendSummaryMailTos is used."
        # Let monitoring continue anyway...
    }

    if (!$MonitoringContext)
    {
        Log-Verbose "Running once."
        RunOnce
        Send-SummaryEmail
        return
    }

    # We are in the MonitoringContext
    [bool] $keepRunning = $true
    [System.Diagnostics.Stopwatch] $overallScriptStopwatch = New-Object -TypeName System.Diagnostics.Stopwatch
    $overallScriptStopwatch.Reset()
    $overallScriptStopwatch.Start()

    Log-Verbose "Running many times..."
    while ($keepRunning)
    {
        RunOnce

        # decide if we should run the next iteration
        if (($TerminateAfterDurationSecs -eq -1) -or `
            ($TerminateAfterDurationSecs -eq 0))
        {
            # infinite duration specified
            $keepRunning = $true
        }
        else
        {
            [double]$lastIterationMsecs = $script:oneIterationStopwatch.Elapsed.TotalMilliseconds
            [double]$timeLeftMsecs = [double]($TerminateAfterDurationSecs * 1000) - $overallScriptStopwatch.Elapsed.TotalMilliseconds

            # Is there enough time left for a (sleep + RunOnce) ?
            if ( ([double]($SleepDurationBetweenIterationsSecs * 1000) + $lastIterationMsecs) -lt $timeLeftMsecs )
            {
                $keepRunning = $true
            }
            else
            {
                $keepRunning = $false
                break
            }
        }

        Sleep-ForSeconds $SleepDurationBetweenIterationsSecs
    }

    Send-SummaryEmail
}

$Command = $MyInvocation.MyCommand
Log-Verbose "Starting: $($Command.Path)"

# The command below is useful to see what parameters are defined in this script cmdlet.
# $Command | fl Path, CommandType, Parameters, ParameterSets

# Get the code compilation out of the way
Prepare-DatabaseRedundancyEntryDefinition
LoadExchangeSnapin

# In datacenter configurations we can use libraries provided by service engineering
# The Exchange install path is read from the registry, trimmed, and stripped of any trailing "\".
$InstallPath = (Get-ItemProperty -path 'HKLM:SOFTWARE\Microsoft\ExchangeServer\v14\Setup').MsiInstallPath.Trim().TrimEnd("\")
$DatacenterLibraryPath = "$InstallPath\DataCenter"
$SvcLibaryFileName = "DatacenterHealthCommonLibrary.ps1"
$ServiceCommonLib = "$DatacenterLibraryPath\$SvcLibaryFileName"

# Send-HANotificationMail reads this flag to decide which mail path to use.
$script:IsDataCenterLibraryPresent = Test-Path $ServiceCommonLib
if ($script:IsDataCenterLibraryPresent)
{
    # The library is only loaded when a summary mail was requested.
    if ($SendSummaryMailTos)
    {
        # Get a send-mail function
        Log-Verbose "Loading DataCenter script library '$ServiceCommonLib'"
        # The common lib doesn't use clean practices so we have to avoid strict mode
        Set-StrictMode -Off
        # Dot-sourced so the library's functions are defined in this script's scope.
        . $ServiceCommonLib
        Set-StrictMode -Version 2.0
    }
}
else
{
    # NOTE(review): despite this message, Send-HANotificationMail still sends through
    # Send-HANotificationMailCorpHub when the library is absent - confirm the intended wording.
    Log-Verbose "File '$DatacenterLibraryPath\$SvcLibaryFileName' is not present, so skipping sending a mail."
}

# All functions are defined at this point; start the actual work.
Main