awkを使用してファイル内の特定の列でグループ化する

Question 1

$ cat tst.awk
# Summarise requests per name. The first input line is a header; every
# other line carries a name in column 1, a node number in column 3 and a
# request count in column 4. Lines for the same name must be adjacent,
# because a group is flushed as soon as column 1 changes.
# For each name one line is printed: the name, each expected node number
# (or "NA" when that node never appeared for the name) and the summed count.

# Header line: print the output header and skip the per-line processing.
NR==1 {
    print "<Name>", "<Nodes>", "<Count of Requests>"
    next
}

# The name changed: flush the previous group (there is none yet on the
# first data line, NR == 2) and remember the new name.
$1 != prev {
    if ( NR > 2 ) {
        prt()
    }
    prev = $1
}

# Every data line: record the node as seen (referencing the element is
# enough to create it) and accumulate the request count.
{
    reqs[$3]
    numReqs += $4
}

# Flush the last group, but only when at least one data line was read.
# Without this guard an empty or header-only input would produce a bogus
# " NA NA NA NA " line.
END {
    if ( NR > 1 ) {
        prt()
    }
}

# Print the summary line for the current group and reset the per-group
# state. i, n, node and nodes are local variables.
function prt(   i, n, node, nodes) {
    # Expected node numbers, in output column order.
    n = split("36 54 69 76", nodes)

    printf "%s ", prev
    for (i=1; i<=n; i++) {
        node = nodes[i]
        printf "%s ", (node in reqs ? node : "NA")
    }
    print numReqs

    delete reqs
    numReqs = 0
}

$ awk -f tst.awk file
<Name> <Nodes> <Count of Requests>
AString1 36 54 69 76 12
BString2 36 54 69 76 7
CString3 36 54 NA 76 23
DString4 36 54 69 76 31

Answer

$ cat tst.awk
# Summarise requests per name. The first input line is a header; every
# other line carries a name in column 1, a node number in column 3 and a
# request count in column 4. Lines for the same name must be adjacent,
# because a group is flushed as soon as column 1 changes.
# For each name one line is printed: the name, each expected node number
# (or "NA" when that node never appeared for the name) and the summed count.

# Header line: print the output header and skip the per-line processing.
NR==1 {
    print "<Name>", "<Nodes>", "<Count of Requests>"
    next
}

# The name changed: flush the previous group (there is none yet on the
# first data line, NR == 2) and remember the new name.
$1 != prev {
    if ( NR > 2 ) {
        prt()
    }
    prev = $1
}

# Every data line: record the node as seen (referencing the element is
# enough to create it) and accumulate the request count.
{
    reqs[$3]
    numReqs += $4
}

# Flush the last group, but only when at least one data line was read.
# Without this guard an empty or header-only input would produce a bogus
# " NA NA NA NA " line.
END {
    if ( NR > 1 ) {
        prt()
    }
}

# Print the summary line for the current group and reset the per-group
# state. i, n, node and nodes are local variables.
function prt(   i, n, node, nodes) {
    # Expected node numbers, in output column order.
    n = split("36 54 69 76", nodes)

    printf "%s ", prev
    for (i=1; i<=n; i++) {
        node = nodes[i]
        printf "%s ", (node in reqs ? node : "NA")
    }
    print numReqs

    delete reqs
    numReqs = 0
}

$ awk -f tst.awk file
<Name> <Nodes> <Count of Requests>
AString1 36 54 69 76 12
BString2 36 54 69 76 7
CString3 36 54 NA 76 23
DString4 36 54 69 76 31

Question 2

別のawkソリューションは次のとおりです。

$ awk '{if(NR==1){printf "%s\t%s\t%s\n", "<Name>","<Nodes>","<Count of Requests>"} else{nodes[$3]=1; a[$1][$3]++; sum[$1]+=$4}}END{for(string in a){ printf "%s\t", string; PROCINFO["sorted_in"]="@ind_num_asc"; for(i in nodes){ val=((i in a[string]) ? i : "NA"); printf "%s ",val }; printf "\t%d\n",sum[string];}}' file
<Name>  <Nodes> <Count of Requests>
CString3    36 54 NA 76     23
DString4    36 54 69 76     31
AString1    36 54 69 76     12
BString2    36 54 69 76     7

後で簡単に区別できるように、各項目（「名前」、「ノード」、「要求数」）の間にタブ文字を追加しました。

以下は、よりきれいな形式の同じスクリプトです（まだ端末に直接コピー/貼り付け可能）。

awk '
    ## NOTE: arrays of arrays (a[x][y]) and PROCINFO["sorted_in"] are GNU awk
    ## extensions, so the "awk" command must be gawk 4.0 or later.
    {
        ## If this is the first line
        if (NR == 1) {
            ## Set up the nodes array with the desired values
            nodes[36] = 1
            nodes[54] = 1
            nodes[69] = 1
            nodes[76] = 1
            ## Print the header
            printf "%s\t%s\t%s\n", "<Name>", "<Nodes>", "<Count of Requests>"
        }
        ## For all except the first line
        else {
            ## Save this node in the nodes array
            nodes[$3] = 1
            ## Add this node to the values found for this string
            a[$1][$3]++
            ## Add the number of requests
            sum[$1] += $4
        }
    }
    ## After we have finished reading the file
    END {
        ## Visit the strings in ascending order so the rows are printed in a
        ## reproducible order; a plain "for (x in arr)" order is unspecified.
        PROCINFO["sorted_in"] = "@ind_str_asc"
        ## a holds all the first fields, the various strings
        for (string in a) {
            ## Print the current string and a tab
            printf "%s\t", string
            ## Visit the nodes in ascending numeric order so that every row
            ## lists the node columns in the same order. The order of the
            ## outer loop was fixed when that loop started, so changing the
            ## setting here does not disturb it.
            PROCINFO["sorted_in"] = "@ind_num_asc"
            ## For each target node
            for (i in nodes) {
                ## If this node was seen for this string, use the node number.
                ## If it was not, use "NA". The "in" test checks membership
                ## without creating a[string][i] as a side effect.
                val = (i in a[string]) ? i : "NA"
                ## Print the value for the node
                printf "%s ", val
            }
            ## Print the sum
            printf "\t%d\n", sum[string]
        }
    }' file

Answer

別のawkソリューションは次のとおりです。

$ awk '{if(NR==1){printf "%s\t%s\t%s\n", "<Name>","<Nodes>","<Count of Requests>"} else{nodes[$3]=1; a[$1][$3]++; sum[$1]+=$4}}END{for(string in a){ printf "%s\t", string; PROCINFO["sorted_in"]="@ind_num_asc"; for(i in nodes){ val=((i in a[string]) ? i : "NA"); printf "%s ",val }; printf "\t%d\n",sum[string];}}' file
<Name>  <Nodes> <Count of Requests>
CString3    36 54 NA 76     23
DString4    36 54 69 76     31
AString1    36 54 69 76     12
BString2    36 54 69 76     7

後で簡単に区別できるように、各項目（「名前」、「ノード」、「要求数」）の間にタブ文字を追加しました。

以下は、よりきれいな形式の同じスクリプトです（まだ端末に直接コピー/貼り付け可能）。

awk '
    ## NOTE: arrays of arrays (a[x][y]) and PROCINFO["sorted_in"] are GNU awk
    ## extensions, so the "awk" command must be gawk 4.0 or later.
    {
        ## If this is the first line
        if (NR == 1) {
            ## Set up the nodes array with the desired values
            nodes[36] = 1
            nodes[54] = 1
            nodes[69] = 1
            nodes[76] = 1
            ## Print the header
            printf "%s\t%s\t%s\n", "<Name>", "<Nodes>", "<Count of Requests>"
        }
        ## For all except the first line
        else {
            ## Save this node in the nodes array
            nodes[$3] = 1
            ## Add this node to the values found for this string
            a[$1][$3]++
            ## Add the number of requests
            sum[$1] += $4
        }
    }
    ## After we have finished reading the file
    END {
        ## Visit the strings in ascending order so the rows are printed in a
        ## reproducible order; a plain "for (x in arr)" order is unspecified.
        PROCINFO["sorted_in"] = "@ind_str_asc"
        ## a holds all the first fields, the various strings
        for (string in a) {
            ## Print the current string and a tab
            printf "%s\t", string
            ## Visit the nodes in ascending numeric order so that every row
            ## lists the node columns in the same order. The order of the
            ## outer loop was fixed when that loop started, so changing the
            ## setting here does not disturb it.
            PROCINFO["sorted_in"] = "@ind_num_asc"
            ## For each target node
            for (i in nodes) {
                ## If this node was seen for this string, use the node number.
                ## If it was not, use "NA". The "in" test checks membership
                ## without creating a[string][i] as a side effect.
                val = (i in a[string]) ? i : "NA"
                ## Print the value for the node
                printf "%s ", val
            }
            ## Print the sum
            printf "\t%d\n", sum[string]
        }
    }' file

Question 3

2 つのインデックス配列を使用して、各名前の要求数と既存のノードを保存することができ、印刷する前に既存のノード配列を固定ノード配列と比較できます。例:

awk 'BEGIN {
       ## Fixed list of expected node numbers, in the order the columns
       ## should be printed; split once here instead of once per name.
       totalNodes="36 54 69 76"
       numNodes=split(totalNodes,tmp)
     }
     ## First input line is the header: print the output header.
     NR==1 {print "<Name> <Nodes> <Count of Requests>"};
     ## Data lines: add column 4 to the total for the name in column 1 and
     ## append the node in column 3 to the list kept for that name, each
     ## entry preceded by a space.
     NR>1 {
         count[$1]+=$4; nodesTmp[$1]=nodesTmp[$1]" "$3;
         }
     END{
         for (i in count) {
             ## Walk the expected nodes by numeric position: "for (j in tmp)"
             ## visits elements in an unspecified order, which could shuffle
             ## the node columns.
             for(j=1;j<=numNodes;j++){
                 ## Compare whole space-delimited tokens. A regex match
                 ## (nodesTmp[i] ~ tmp[j]) also succeeds on substrings, so
                 ## node 36 would be reported as present because of 136.
                 if(index(nodesTmp[i]" ", " "tmp[j]" ")){
                     nodes[i]=nodes[i]" "tmp[j]
                 }
                 else {
                     nodes[i]=nodes[i]" NA"
                 }
             }
             ## Rows are piped through sort so they come out ordered by name.
             print i, nodes[i], count[i] | "sort"
         };
         ## Close the pipe so that sort finishes and emits its output
         ## before awk exits.
         close("sort")
     }' temp.log

Answer

2 つのインデックス配列を使用して、各名前の要求数と既存のノードを保存することができ、印刷する前に既存のノード配列を固定ノード配列と比較できます。例:

awk 'BEGIN {
       ## Fixed list of expected node numbers, in the order the columns
       ## should be printed; split once here instead of once per name.
       totalNodes="36 54 69 76"
       numNodes=split(totalNodes,tmp)
     }
     ## First input line is the header: print the output header.
     NR==1 {print "<Name> <Nodes> <Count of Requests>"};
     ## Data lines: add column 4 to the total for the name in column 1 and
     ## append the node in column 3 to the list kept for that name, each
     ## entry preceded by a space.
     NR>1 {
         count[$1]+=$4; nodesTmp[$1]=nodesTmp[$1]" "$3;
         }
     END{
         for (i in count) {
             ## Walk the expected nodes by numeric position: "for (j in tmp)"
             ## visits elements in an unspecified order, which could shuffle
             ## the node columns.
             for(j=1;j<=numNodes;j++){
                 ## Compare whole space-delimited tokens. A regex match
                 ## (nodesTmp[i] ~ tmp[j]) also succeeds on substrings, so
                 ## node 36 would be reported as present because of 136.
                 if(index(nodesTmp[i]" ", " "tmp[j]" ")){
                     nodes[i]=nodes[i]" "tmp[j]
                 }
                 else {
                     nodes[i]=nodes[i]" NA"
                 }
             }
             ## Rows are piped through sort so they come out ordered by name.
             print i, nodes[i], count[i] | "sort"
         };
         ## Close the pipe so that sort finishes and emits its output
         ## before awk exits.
         close("sort")
     }' temp.log

Question 4

GNU awk（gawk）が使え、真の二重インデックス配列（配列の配列）とPROCINFOを利用できると仮定すると、次の方法が機能します。

gawk '
      # By default, traverse arrays in ascending string order of their
      # indices; this governs the order in which the names are printed.
      BEGIN {
        PROCINFO["sorted_in"] = "@ind_str_asc"
      }

      # Every line except the first of each input file: register the node
      # globally, mark it as seen for this name, and add to the name total.
      FNR > 1 {
        name = $1
        node = $3
        sgl_nodes[name][node] = 1
        nodes[node] = 1
        total[name] += $4
      }

      # After all input: print the header, then one row per name.
      END {
        printf "<Name> <Nodes> <Count of Requests>\n"
        for (name in sgl_nodes) {
          printf "%s%s", name, OFS

          # Switch to numeric index order while walking the node list,
          # then restore the string order afterwards.
          PROCINFO["sorted_in"] = "@ind_num_asc"
          for (node in nodes) {
            if (node in sgl_nodes[name])
              printf "%s%s", node, OFS
            else
              printf "NA%s", OFS
          }
          PROCINFO["sorted_in"] = "@ind_str_asc"

          printf "%d%s", total[name], ORS
        }
      }' temp.log

これは、実際に出現したノード番号を配列nodesに登録し、各名前に関連付けられたノード番号を、<Name>の値を最初のインデックスとする2次元配列sgl_nodesに登録します。その過程で、要求の総数も、<Name>の値に対応するインデックスを使って配列totalに合計されます。

ファイルの終わりに達したら、まずヘッダー行を出力します。次に配列sgl_nodesを走査して各<Name>の値（sgl_nodesの最初のインデックス）を出力し、nodesに登録されているすべてのノードを走査して、その<Name>でノードが見つかった場合はノード番号を、見つからなかった場合はNAを出力し、最後に要求の総数を出力します。

BEGINセクションで行ったPROCINFOの設定のおかげで、<Name>の値は昇順に出力されます。この設定は、配列nodesを走査する間だけ数値順ソートに上書きされます。

この方法はtemp.logの項目がソートされていることに依存せず、ノード番号を自動的に認識します。

ただし、あるノード番号がすべての項目で欠落している場合、ノード番号を自動認識するという利点は逆に問題になります。しかし、ノードのリストは固定と見なせるとのことなので、リストをハードコーディングすればこの問題を回避できます。

gawk '
      # Traverse the names in ascending string order, and load the fixed
      # list of expected node numbers. numNodes keeps the list length so
      # the loop below never has to repeat it as a literal.
      BEGIN {
        PROCINFO["sorted_in"] = "@ind_str_asc"
        numNodes = split("36 54 69 76", nodes)
      }

      # Every line except the first of each input file: add column 4 to
      # the total for the name in column 1 and mark the node in column 3
      # as seen for that name.
      FNR > 1 {
        total[$1] += $4
        sgl_nodes[$1][$3] = 1
      }

      # After all input: print the header, then one row per name.
      END {
        printf "<Name> <Nodes> <Count of Requests>\n"
        for (f in sgl_nodes) {
          printf "%s%s", f, OFS
          # Walk the expected nodes in list order. The "in" test checks
          # membership without creating the element as a side effect.
          for (g = 1; g <= numNodes; g++) {
            if (nodes[g] in sgl_nodes[f])
              printf "%s%s", nodes[g], OFS
            else
              printf "NA%s", OFS
          }
          printf "%d%s", total[f], ORS
        }
      }' temp.log

Answer